def main():
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    INPUT_FILE = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed', 'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')
    # null out output files that already exist so they are not rewritten
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)
        process_file(process, INPUT_FILE, lock=lock, pim=pim,
                     kim_file=kim_file, fsa_file=fsa_file,
                     scored_kmers_file=scored_kmers_file)
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
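
# Several of the pipeline main() functions in this listing hand a worker
# function to a process_file(fn, path, **kwargs) helper defined elsewhere in
# the project (distinct from the Django-style process_file method below).
# The call sites imply a contract of "split the input file into chunks and
# run fn on each chunk in a worker pool". A minimal sketch of such a
# dispatcher follows; the chunk size, pool size, and exact worker signature
# are illustrative assumptions, not the project's actual implementation.
from multiprocessing import Pool, cpu_count

def process_file_sketch(fn, infile, chunk_lines=100000, **kwargs):
    """Apply `fn` to successive line-chunks of `infile` in parallel.

    `fn` must be a picklable (module-level) function taking a list of lines
    as its first argument plus the forwarded keyword arguments.
    """
    with open(infile) as f, Pool(cpu_count()) as pool:
        results, chunk = [], []
        for line in f:
            chunk.append(line)
            if len(chunk) == chunk_lines:
                results.append(pool.apply_async(fn, (chunk,), kwargs))
                chunk = []
        if chunk:  # flush the final partial chunk
            results.append(pool.apply_async(fn, (chunk,), kwargs))
        for r in results:
            r.get()  # re-raise any worker exception in the parent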
def process_file(self, name):
    # try to get object from database by its base name
    # need to use a .is_created function since whatever file is here
    # will get processed and no longer exist
    exists = MediaFile.objects.filter(name=name).exists()
    if not exists:
        istv = re.search(r"S([0-9]+)E([0-9]+)", name, re.I)
        if istv:
            if self.debug:
                print("Found a tv show: %s for season: %s, episode %s" % (
                    name, istv.group(1), istv.group(2)))
        else:
            if self.debug:
                print("Found something that is not a tv show %s" % name)
        if not self.test:
            f, created = MediaFile.objects.get_or_create(name=name)
            if created:
                # local import: the utility-level process_file, not this method
                from utility import process_file
                if process_file(name):
                    self.file_count += 1
                else:
                    logger.error("file:%s" % (name,))
            else:
                self.ignore_count += 1
        else:
            print("This is where we would process if we weren't in test mode")
    else:
        if self.debug:
            print("We have already encountered '%s' and are skipping it" % name)
        self.ignore_count += 1
def main():
    # load params
    params = get_params()
    project = params['project']
    k = params['k']
    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')
    # if the output file already exists, there is nothing to do
    if file_exists(outfile):
        exit(0)
    # create catted samples file if it does not exist
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)
    # multiprocessing queue for transferring data to the main thread
    q = Manager().Queue()
    # invoke process(...) on catted_samples chunks with kwargs, one per thread
    process_file(process, catted_samples, q=q, k=k)
    # consolidate all threads' counters into a single counter holding all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    # fold each kmer's complement count into the kmer itself
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # unary plus drops zero and negative counts
    printd('Finished consolidating counters.')
    # write counter to file
    write_dict(counter, outfile, sep='\t')
    # remove catted samples file
    if file_exists(catted_samples):
        remove(catted_samples)
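
# complement(kmer) above is not defined in this snippet; in k-mer counting it
# is typically the reverse complement, so that a k-mer and its reverse
# complement collapse into one canonical count. A minimal sketch under that
# assumption (uppercase DNA alphabet only):
_RC_TABLE = str.maketrans('ACGT', 'TGCA')

def complement(kmer):
    """Return the reverse complement of an uppercase DNA k-mer."""
    return kmer.translate(_RC_TABLE)[::-1]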
def process_item(self, item, base=None):
    '''Process the given item. If base is not None, it is prepended to item
    to form an absolute path; if base is None, item is assumed to be an
    absolute path (or relative to the cwd), so no prefix is needed.
    '''
    # get absolute path for item
    if base is not None:
        name = os.path.join(base, item)
    else:
        name = item
    # check what type of item this is and process accordingly
    if os.path.isfile(name):
        if os.path.islink(name):
            # store a MediaLink for symlinks found
            medialink, created = MediaLink.objects.get_or_create(
                name=item, comments=name)
            # @TODO find the mediafile it points to, or create it
            filepath = os.readlink(name)
            filelocation, linkpointsto = os.path.split(filepath)
            f, created = MediaFile.objects.get_or_create(name=linkpointsto)
            medialink.mediafile = f
            medialink.save()
        else:
            # try to get object from database by its base name
            f, created = MediaFile.objects.get_or_create(name=item)
            if created:
                f.classify()
                f.process()
                from utility import process_file
                if process_file(name):
                    self.file_count += 1
                else:
                    logger.error("file:%s" % (name,))
            else:
                self.ignore_count += 1
    elif os.path.isdir(name):
        # need to link it up with its parent when created here...
        d, created = MediaLocation.objects.get_or_create(name=item)
        if created:
            d.classify()
            d.process()
            from utility import process_dir
            if process_dir(name):
                self.dir_count += 1
            else:
                self.ignore_count += 1
        else:
            self.dir_count += 1
    else:
        self.ignore_count += 1
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # load the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    # load the label encoder
    one_hot = read_obj('resources/one_hot_encoder.pkl')
    categories = one_hot.classes_
    # array for storing the predictions
    y_pred = np.zeros(shape=(len(x_test), len(categories)), dtype=np.int32)
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred[start_id:end_id] = session.run(
            tf.round(tf.nn.sigmoid(model.logits)), feed_dict=feed_dict)

    # evaluate
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=categories))

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def main():
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    # only do processing if output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file)
            or ((not file_exists(similar_sample_file)
                 or not file_exists(dissimilar_sample_file))
                and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read all sequences in the input into a python object
        seqs = parse_input(samples_file)
        # number of samples
        n_samples = int(len(sim) / 2)
        # upper and lower bounds on sample frequency for filtering kmers
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data to the main thread
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking files before writing to them
        lock = m.Lock()
        # kmer file name reference for subprocesses to write to; a copy is
        # kept because the int map creation below needs the original name
        kmer_sample_file_ref = kmer_sample_file
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        kwargs = dict(raw=seqs, k=params['k'],
                      thresh=params['correlation-thresh'],
                      upper=upper, lower=lower,
                      dfdisp=dfdisp, dfnodisp=dfnodisp,
                      sim=sim, n=n_samples,
                      kmer_sample_file=kmer_sample_file_ref,
                      kmer_pheno_file=kmer_pheno_file)
        process_file(create_kmer_sample_map, unique_kmers_file,
                     q=q, lock=lock, **kwargs)
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # consolidate all chunks' partial counts sequentially
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        # create sample similarity files if they do not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                           hist_orig_file, hist_sim_scaled_file,
                           hist_dissim_scaled_file, similar_sample_file,
                           dissimilar_sample_file)
    # if processing was skipped but the similarities tsv exists, derive the
    # similarity files from the tsv instead
    if ((not file_exists(similar_sample_file)
         or not file_exists(dissimilar_sample_file))
            and file_exists(similarities_tsv)):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
                       hist_sim_scaled_file, hist_dissim_scaled_file,
                       similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
def train():
    print("Configuring TensorBoard and Saver...")
    # configure TensorBoard; when retraining, delete the tensorboard folder
    # first, otherwise the new graph will overwrite the old one
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # configure Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # create session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total batches so far
    best_acc_val = 0.0           # best validation accuracy
    last_improved = 0            # batch of the last improvement
    require_improvement = 1000   # stop early after 1000 batches without improvement

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # every save_per_batch batches, write training results to the
                # tensorboard scalars
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # every print_per_batch batches, report performance on the
                # training and validation sets
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc],
                                                    feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                if acc_val > best_acc_val:
                    # save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train,
                                 loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimizer
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time:
                # stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # exit the inner loop
        if flag:  # same as above, for the outer loop
            break
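
# batch_iter above is assumed to shuffle the training arrays and yield
# (x_batch, y_batch) pairs of at most batch_size rows per step. A minimal
# sketch under that assumption (x and y as numpy arrays):
import numpy as np

def batch_iter(x, y, batch_size=64):
    """Shuffle x and y together and yield successive mini-batches."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffled, y_shuffled = x[indices], y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffled[start_id:end_id], y_shuffled[start_id:end_id]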
def main():
    # get params
    params = get_params()
    project = params['project']
    # define data paths
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    kmer_sample_map_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_map_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    contains_sample_kmer_file = join(project, 'data', 'preprocessed', 'contains_obs.txt')
    value_sample_pheno_file = join(project, 'data', 'preprocessed', 'samplePheno_obs.txt')
    value_kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmerPheno_target.txt')
    similar_pheno_pheno_file = join(project, 'data', 'preprocessed', 'similarPheno_obs.txt')

    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)

    # incorporate truth data
    if params.get('truth'):
        truths_infile = join(project, 'data', 'raw', params['truth'])
        truths_dict = create_truths_dict(truths_infile, pim)
        truth_kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmerPheno_truth.txt')
    else:
        truths_dict = None
        truth_kmer_pheno_file = None
    # incorporate baseline data
    if params.get('baseline'):
        baseline_infile = join(project, 'data', 'raw', params['baseline'])
        baseline_dict = create_truths_dict(baseline_infile, pim)
        baseline_kmer_pheno_file = join(project, 'data', 'preprocessed', 'baseline_obs.txt')
    else:
        baseline_dict = None
        baseline_kmer_pheno_file = None

    # create the smaller psl input files, which can be done efficiently
    # with a single thread
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)

    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = file_exists(truth_kmer_pheno_file) if params.get('truth') else True
    baseline_exists = file_exists(baseline_kmer_pheno_file) if params.get('baseline') else True

    lock = Manager().Lock()
    if not contains_exists:
        process_file(kmer_sample_db, kmer_sample_map_file, kim_file=kim_file,
                     lock=lock, truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)
    if not value_exists or not truths_exists or not baseline_exists:
        if value_exists:
            value_kmer_pheno_file = None
        if truths_exists:
            truth_kmer_pheno_file = None
        if baseline_exists:
            baseline_kmer_pheno_file = None
        process_file(kmer_pheno_db, kmer_pheno_map_file, kim_file=kim_file,
                     value_kmer_pheno_file=value_kmer_pheno_file,
                     truth_kmer_pheno_file=truth_kmer_pheno_file,
                     lock=lock, truths=truths_dict, baseline=baseline_dict,
                     baseline_kmer_pheno_file=baseline_kmer_pheno_file)