Example #1
def main():
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    INPUT_FILE = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed',
                             'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')

    # only generate output files that do not already exist
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)

        process_file(process,
                     INPUT_FILE,
                     lock=lock,
                     pim=pim,
                     kim_file=kim_file,
                     fsa_file=fsa_file,
                     scored_kmers_file=scored_kmers_file)
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
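
Several of the examples on this page (including this one) dispatch work through a project-specific process_file(worker, path, **kwargs) helper whose implementation is not shown. As a rough sketch only, assuming the helper splits the input file into line chunks and fans them out to a multiprocessing pool (the real chunking, ordering, and error handling may differ):

from functools import partial
from multiprocessing import Pool, cpu_count

def process_file(worker, path, **kwargs):
    # Hypothetical stand-in: read the file, split its lines into one chunk
    # per CPU, and run worker(chunk, **kwargs) on each chunk in parallel.
    with open(path) as f:
        lines = f.readlines()
    n_chunks = cpu_count()
    step = max(1, -(-len(lines) // n_chunks))  # ceiling division
    chunks = [lines[i:i + step] for i in range(0, len(lines), step)]
    with Pool(n_chunks) as pool:
        pool.map(partial(worker, **kwargs), chunks)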
Example #2
def process_file(self, name):
    # try to get the object from the database by its base name;
    # a .is_created check is needed since whatever file is here will be
    # processed and will no longer exist
    exists = MediaFile.objects.filter(name=name).exists()
    if not exists:
        istv = re.search(r"S([0-9]+)E([0-9]+)", name, re.I)
        if istv:
            if self.debug:
                print("Found a tv show: %s for season: %s, episode %s" % (
                    name, istv.group(1), istv.group(2)))
        else:
            if self.debug:
                print("Found something that is not a tv show %s" % name)
        if not self.test:
            f, created = MediaFile.objects.get_or_create(name=name)
            if created:
                from utility import process_file
                if process_file(name):
                    self.file_count += 1
                else:
                    logger.error("file:%s" % (name,))
            else:
                self.ignore_count += 1
        else:
            print("This is where we would process if we weren't in test mode")
    else:
        if self.debug:
            print("We have already encountered '%s' and are skipping it" % name)
        self.ignore_count += 1
Example #3
def main():
    # load params 
    params = get_params()
    project = params['project']
    k = params['k']

    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')

    # check if output file exists; if so, do nothing.
    if file_exists(outfile):
        exit(0)

    # create catted samples file if it does not exist.
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)

    # multiprocessing queue for transferring data to the main thread
    q = Manager().Queue()

    # invoke process(...) on chunks of the catted samples file with kwargs, one chunk per thread
    process_file(process, catted_samples, q=q, k=k)
    
    # consolidate all threads' counters into single counter holding all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    # fold each k-mer's count into its complement so only one of the pair keeps the total
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # drop zero-count entries
    printd('Finished consolidating counters.')

    # write counter to file
    write_dict(counter, outfile, sep='\t')
    
    # remove catted samples file
    if file_exists(catted_samples):
        remove(catted_samples)
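
The consolidation step above relies on the project's complement and write_dict helpers, which are not shown here. A minimal sketch, assuming complement returns the reverse complement of a DNA k-mer and write_dict writes one tab-separated key/value pair per line:

def complement(kmer):
    # assumption: uppercase A/C/G/T k-mers; returns the reverse complement
    return kmer.translate(str.maketrans('ACGT', 'TGCA'))[::-1]

def write_dict(d, outfile, sep='\t'):
    # write one "<key><sep><value>" line per entry
    with open(outfile, 'w') as f:
        for key, value in d.items():
            f.write('%s%s%s\n' % (key, sep, value))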
Example #4
def process_item(self, item, base=None):
    '''Process the given item. If base is not None it is prepended to
    item to build an absolute path; if base is None, item is assumed to
    be an absolute path (or relative to the current working directory).'''
    # get absolute path for item
    if base is not None:
        name = os.path.join(base, item)
    else:
        name = item
    # check what type of item this is and process accordingly
    if os.path.isfile(name):
        if os.path.islink(name):
            # store a MediaLink for symlinks found
            medialink, created = MediaLink.objects.get_or_create(
                name=item, comments=name)
            # @TODO find the MediaFile it points to or create it
            filepath = os.readlink(name)
            filelocation, linkpointsto = os.path.split(filepath)
            f, created = MediaFile.objects.get_or_create(name=linkpointsto)
            medialink.mediafile = f
            medialink.save()
        else:
            # try to get the object from the database by its base name
            f, created = MediaFile.objects.get_or_create(name=item)
            if created:
                f.classify()
                f.process()
                from utility import process_file
                if process_file(name):
                    self.file_count = self.file_count + 1
                else:
                    logger.error("file:%s" % (name,))
            else:
                self.ignore_count = self.ignore_count + 1
    elif os.path.isdir(name):
        # need to link it up with its parent when created here...
        d, created = MediaLocation.objects.get_or_create(name=item)
        if created:
            d.classify()
            d.process()
            from utility import process_dir
            if process_dir(name):
                self.dir_count = self.dir_count + 1
        else:
            self.ignore_count = self.ignore_count + 1
        self.dir_count = self.dir_count + 1
    else:
        self.ignore_count = self.ignore_count + 1
Example #5
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    # load the labels
    one_hot = read_obj('resources/one_hot_encoder.pkl')
    categories = one_hot.classes_

    y_pred = np.zeros(shape=(len(x_test), len(categories)),
                      dtype=np.int32)  # holds the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred[start_id:end_id] = session.run(tf.round(
            tf.nn.sigmoid(model.logits)),
                                              feed_dict=feed_dict)

    # evaluation
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test, y_pred, target_names=categories))

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
Example #6
def main():
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl') 
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')

    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    
    # only do processing if output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file) 
            or ((not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file))
            and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read in all sequences in input into python object
        seqs = parse_input(samples_file)
        # number of samples
        n_samples = int(len(sim) / 2)
        # upper and lower bounds for frequency of samples to filter kmers by
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data to the main thread
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking file before writing to it
        lock = m.Lock()
        # k-mer sample file name for the subprocesses to write to; kept as a separate
        # reference because kmer_sample_file itself is still needed for the int map below
        kmer_sample_file_ref = kmer_sample_file
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        
        kwargs = dict(raw=seqs, k=params['k'], thresh=params['correlation-thresh'],
                    upper=upper, lower=lower, dfdisp=dfdisp, dfnodisp=dfnodisp,
                    sim=sim, n=n_samples,
                    kmer_sample_file=kmer_sample_file_ref,
                    kmer_pheno_file=kmer_pheno_file)

        process_file(create_kmer_sample_map, unique_kmers_file, q=q, lock=lock, **kwargs)
       
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # accumulate the per-chunk k-mer counts and sample matrices returned by the workers
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        
        # create sample similarity file if the similarities tsv does not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                hist_orig_file, hist_sim_scaled_file, hist_dissim_scaled_file,
                similar_sample_file, dissimilar_sample_file)
    if (not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file)) and file_exists(similarities_tsv):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
            hist_sim_scaled_file, hist_dissim_scaled_file,
            similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
Example #7
def train():
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard; when retraining, delete the tensorboard folder first, otherwise the graphs will be overwritten
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # create a session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 1000  # stop training early if no improvement after 1000 batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # write the training results to TensorBoard scalars every save_per_batch batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # report performance on the training and validation sets every print_per_batch batches
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc],
                                                    feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                if acc_val > best_acc_val:
                    # save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time; stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:  # early stop triggered above; break out of the epoch loop as well
            break
Example #8
def main():
    # get params
    params = get_params()
    project = params['project']

    # define data paths
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    kmer_sample_map_file = join(project, 'data', 'preprocessed',
                                'kmer_sample_map.txt')
    kmer_pheno_map_file = join(project, 'data', 'preprocessed',
                               'kmer_pheno_map.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    contains_sample_kmer_file = join(project, 'data', 'preprocessed',
                                     'contains_obs.txt')
    value_sample_pheno_file = join(project, 'data', 'preprocessed',
                                   'samplePheno_obs.txt')
    value_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                 'kmerPheno_target.txt')
    similar_pheno_pheno_file = join(project, 'data', 'preprocessed',
                                    'similarPheno_obs.txt')

    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)

    # incorporate truth data
    if params.get('truth'):
        truths_infile = join(project, 'data', 'raw', params['truth'])
        truths_dict = create_truths_dict(truths_infile, pim)
        truth_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                     'kmerPheno_truth.txt')
    else:
        truths_dict = None
        truth_kmer_pheno_file = None

    # incorporate baseline data
    if params.get('baseline'):
        baseline_infile = join(project, 'data', 'raw', params['baseline'])
        baseline_dict = create_truths_dict(baseline_infile, pim)
        baseline_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                        'baseline_obs.txt')
    else:
        baseline_dict = None
        baseline_kmer_pheno_file = None

    # create smaller psl input files that can be generated efficiently with a single thread
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)

    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = file_exists(truth_kmer_pheno_file) if params.get(
        'truth') else True
    baseline_exists = file_exists(baseline_kmer_pheno_file) if params.get(
        'baseline') else True

    lock = Manager().Lock()

    if not contains_exists:
        process_file(kmer_sample_db,
                     kmer_sample_map_file,
                     kim_file=kim_file,
                     lock=lock,
                     truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)

    if not value_exists or not truths_exists or not baseline_exists:
        if value_exists:
            value_kmer_pheno_file = None
        if truths_exists:
            truth_kmer_pheno_file = None
        if baseline_exists:
            baseline_kmer_pheno_file = None
        process_file(kmer_pheno_db,
                     kmer_pheno_map_file,
                     kim_file=kim_file,
                     value_kmer_pheno_file=value_kmer_pheno_file,
                     truth_kmer_pheno_file=truth_kmer_pheno_file,
                     lock=lock,
                     truths=truths_dict,
                     baseline=baseline_dict,
                     baseline_kmer_pheno_file=baseline_kmer_pheno_file)