Exemplo n.º 1
0
 def extract_fast5(input_file_path, bin_h, mode='DNA'):
     """
     Extract the signal and label from a single fast5 file.

     The raw signal is optionally reversed (RNA mode) and normalized, then cut
     along segment boundaries into chunks spanning at most FLAGS.length
     samples. Each chunk is zero-padded to FLAGS.length and appended, together
     with its padded label sequence, to the buffers ``event``,
     ``event_length``, ``label`` and ``label_length`` that are defined outside
     this function. When more than FLAGS.batch chunks are buffered, one batch
     is packed with ``format_string`` and written to ``bin_h``.

     Args:
         input_file_path: path of a fast5 file.
         bin_h: handle of the binary file.
         mode: The signal type dealt with, compared case-insensitively; 'rna'
             reverses the raw signal. Default to 'DNA'.

     Returns:
         True if one batch was written to ``bin_h``; False otherwise,
         including when the file could not be read (the path is then appended
         to ``fail_list``).
     """
     try:
         (raw_data, raw_label, raw_start, raw_length) = labelop.get_label_raw(
             input_file_path, FLAGS.basecall_group, FLAGS.basecall_subgroup)
     except Exception:
         # Any read/parse failure marks the file as failed. Unlike a bare
         # ``except:``, KeyboardInterrupt and SystemExit still propagate.
         fail_list.append(input_file_path)
         return False
     if isinstance(mode, str) and mode.lower() == 'rna':
         raw_data = raw_data[::-1]
     if FLAGS.normalization == 'mean':
         # NOTE(review): this centers on the median although the option is
         # named 'mean'; kept unchanged to preserve existing output - confirm.
         raw_data = (raw_data - np.median(raw_data)) / float(np.std(raw_data))
     elif FLAGS.normalization == 'median':
         raw_data = (raw_data - np.median(raw_data)) / float(robust.mad(raw_data))
     # pre_start / pre_index mark where the chunk currently being built begins
     # (sample position and segment index respectively).
     pre_start = raw_start[0]
     pre_index = 0
     for index, start in enumerate(raw_start):
         if start - pre_start > FLAGS.length:
             if index - 1 == pre_index:
                 # If a single segment is longer than the maximum signal length, skip it.
                 pre_start = start
                 pre_index = index
                 continue
             # Close the chunk at the previous segment boundary so that it
             # spans no more than FLAGS.length samples, then zero-pad it.
             chunk_end = raw_start[index - 1]
             event.append(np.pad(raw_data[pre_start:chunk_end],
                                 (0, FLAGS.length + pre_start - chunk_end), mode='constant'))
             event_length.append(int(chunk_end - pre_start))
             label_ind = raw_label['base'][pre_index:(index - 1)]
             temp_label = [DNA_BASE[x.decode('UTF-8')] for x in label_ind]
             # Labels are padded with -1 up to FLAGS.length entries.
             label.append(
                 np.pad(temp_label, (0, FLAGS.length - index + 1 + pre_index), mode='constant', constant_values=-1))
             label_length.append(index - 1 - pre_index)
             pre_index = index - 1
             pre_start = chunk_end
         if raw_start[index] - pre_start > FLAGS.length:
             # Skip a single event segment longer than the required signal length
             pre_index = index
             pre_start = raw_start[index]
     success_list.append(input_file_path)
     # At most one batch is written per call; any remaining chunks stay in the
     # buffers.
     if len(event) > FLAGS.batch:
         for index in range(0, FLAGS.batch):
             record = ([event_length[index]] + event[index].tolist()
                       + [label_length[index]] + label[index].tolist())
             bin_h.write(struct.pack(format_string, *record))
         del event[:FLAGS.batch]
         del event_length[:FLAGS.batch]
         del label[:FLAGS.batch]
         del label_length[:FLAGS.batch]
         return True
     return False
Exemplo n.º 2
0
def extract_file(input_file,output_file):
    """
    Dump the raw signal and the segment labels of one input file as text.

    Writes two files: ``output_file + '.signal'`` holding the raw signal
    values separated by single spaces, and ``output_file + '.label'`` holding
    one ``"<start> <end> <base>"`` line per entry of the segment start list.

    Args:
        input_file: path of the file passed to ``labelop.get_label_raw``.
        output_file: path prefix for the two output files.

    Returns:
        True if both files were written, False if the input could not be read.
    """
    try:
        (raw_data, raw_label, raw_start, raw_length) = labelop.get_label_raw(
            input_file, FLAGS.basecall_group, FLAGS.basecall_subgroup)
    except Exception:
        # Reading failed; report it to the caller without creating any output.
        return False
    # The context manager closes both handles even if a write raises.
    with open(output_file+'.signal','w+') as f_signal, \
            open(output_file+'.label','w+') as f_label:
        f_signal.write(" ".join(str(val) for val in raw_data))
        for index,start in enumerate(raw_start):
            base = raw_label['base'][index]
            # '%c' needs a single character: decode bytes rather than calling
            # str() on them, which yields "b'A'" on Python 3.
            if isinstance(base, bytes):
                base = base.decode('UTF-8')
            else:
                base = str(base)
            f_label.write("%d %d %c\n"%(start,start+raw_length[index],base))
    return True
Exemplo n.º 3
0
def extract_file(input_file):
    """
    Load the raw signal and the per-segment labels of one input file.

    Args:
        input_file: path of the file passed to ``labelop.get_label_raw``.

    Returns:
        ``(True, (raw_data, segments))`` on success, where ``segments`` is a
        NumPy array of dtype 'S8' with one ``[start, end, base]`` row per
        entry of the segment start list, and ``raw_data`` is reversed when
        FLAGS.mode is 'rna'. ``(False, (None, None))`` if reading raised an
        exception; the exception message is printed.
    """
    try:
        loaded = labelop.get_label_raw(input_file, FLAGS.basecall_group,
                                       FLAGS.basecall_subgroup)
        raw_data, raw_label, raw_start, raw_length = loaded
    except Exception as err:
        print(str(err))
        return False, (None, None)

    # One [start, end, base] row per segment.
    segments = [
        [begin, begin + raw_length[pos], str(raw_label['base'][pos])]
        for pos, begin in enumerate(raw_start)
    ]
    signal = raw_data[::-1] if FLAGS.mode == 'rna' else raw_data
    return True, (signal, np.array(segments, dtype='S8'))