def test_check_HDF5_mapped_read_file(self):
        """A Read object built from non-conforming data should fail its own
        check(), and a file containing it should fail the file-level check().
        """
        print("Creating flawed Read object from test data")
        flawed_dict = construct_mapped_read()
        # Deliberately corrupt one field so validation has something to catch
        flawed_dict['Reference'] = "I'm not a numpy array!"  # Wrong type!
        flawed_read = mapped_signal_files.Read(flawed_dict)

        print("Checking contents")
        object_report = flawed_read.check()
        print("Check result on read object: should fail")
        print(object_report)
        self.assertNotEqual(object_report, "pass")

        print("Writing to file")
        with mapped_signal_files.HDF5(self.testfilepath, "w") as outfile:
            outfile.write_read(flawed_read['read_id'], flawed_read)
            outfile.write_version_number(7)

        print("Current dir = ", os.getcwd())
        print("File written to ", self.testfilepath)

        print("\nOpening file for reading")
        with mapped_signal_files.HDF5(self.testfilepath, "r") as infile:
            stored_ids = infile.get_read_ids()
            print("Read ids=", stored_ids[0])
            print("Version number = ", infile.get_version_number())
            self.assertEqual(stored_ids[0], flawed_dict['read_id'])

            file_report = infile.check()
            print("Test report (should fail):", file_report)
            self.assertNotEqual(file_report, "pass")
Exemplo n.º 2
0
    def test_prepare_remap(self):
        """Run the remapping script as a subprocess and validate its output.

        Checks that the generated mapped-signal file passes the format
        check and that a sampled chunk has a plausible mean dwell time.
        """
        print("Current directory is", os.getcwd())
        print("Taiyaki dir is", self.taiyakidir)
        print("Data dir is ", self.datadir)
        cmd = [
            self.script, self.read_dir, self.per_read_params,
            self.output_mapped_signal_file, self.remapping_model,
            self.per_read_refs, "--device", "cpu"
        ]
        r = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print("Result of running make command in shell:")
        print("Stdout=", r.stdout.decode('utf-8'))
        print("Stderr=", r.stderr.decode('utf-8'))
        # Fail early if the script itself failed; otherwise the file checks
        # below produce confusing secondary errors.
        self.assertEqual(r.returncode, 0,
                         "Remapping script exited with non-zero status")

        # Open mapped read file and run checks to see if it complies with the
        # file format. Also get a chunk and check that speed is within
        # reasonable bounds.
        with mapped_signal_files.HDF5(self.output_mapped_signal_file,
                                      "r") as f:
            testreport = f.check()
            print("Test report from checking mapped read file:")
            print(testreport)
            self.assertEqual(testreport, "pass")
            read0 = f.get_multiple_reads("all")[0]
            # Defined start_sample to make it reproducible - otherwise randomly
            # located chunk is returned.
            chunk = read0.get_chunk_with_sample_length(1000, start_sample=10)
            # Small epsilon in the denominator guards against an empty
            # sequence causing a ZeroDivisionError.
            chunk_meandwell = len(
                chunk['current']) / (len(chunk['sequence']) + 0.0001)
            print("chunk mean dwell time in samples = ", chunk_meandwell)
            # Use a unittest assertion (consistent with the rest of this test
            # class) rather than a bare assert, which would be stripped by -O.
            self.assertTrue(
                7 < chunk_meandwell < 13,
                "Chunk mean dwell time outside allowed range 7 to 13")

        return
Exemplo n.º 3
0
def load_read_data(input_files, read_limit, log, read_ids):
    """Load mapped reads from one or more per-read files and shuffle them.

    param: input_files : iterable of paths to mapped-signal HDF5 files
    param: read_limit  : maximum number of reads to take per file
    param: log         : writable object used for progress messages
    param: read_ids    : read id selection passed through to
                         get_multiple_reads (e.g. a list of ids or "all")

    Returns a tuple (reads, alphabet) where reads is a shuffled list of
    Read objects (as defined in mapped_signal_files.py, dict-based) and
    alphabet is the canonical base list.
    """
    reads = []
    for path in input_files:
        log.write('* Loading data from {}\n'.format(path))
        log.write('* Per read file MD5 {}\n'.format(helpers.file_md5(path)))
        with mapped_signal_files.HDF5(path, "r") as per_read_file:
            reads.extend(
                per_read_file.get_multiple_reads(read_ids,
                                                 max_reads=read_limit))
    random.shuffle(reads)
    return reads, ['A', 'C', 'G', 'T']
Exemplo n.º 4
0
def generate_output_from_results(results, args):
    """
    Given an iterable of dictionaries, each representing the results of mapping
    a single read, output a mapped-read file.
    This version outputs to the V7 'chunk' file format (actually containing mapped reads, not chunks)

    param: results     : an iterable of read dictionaries
                         (with mappings)
    param: args        : command line args object
    """
    progress = helpers.Progress()

    # filter(None, results) drops falsy entries (None, False, 0) so only
    # genuine result dictionaries are written to the output file.
    read_ids = []
    with mapped_signal_files.HDF5(args.output, "w") as f:
        f.write_version_number()
        for readnumber, resultdict in enumerate(filter(None, results)):
            progress.step()  # advance the progress indicator once per read
            read_id = resultdict['read_id']
            # NOTE(review): read_ids is accumulated but not used in the code
            # visible here - presumably consumed later in this function or
            # this snippet is truncated; confirm against the full source.
            read_ids.append(read_id)
            f.write_read(read_id, mapped_signal_files.Read(resultdict))
Exemplo n.º 5
0
# Command-line options controlling which reads get plotted.
parser.add_argument(
    '--nreads', type=Positive(int), default=10,
    help='Number of reads to plot. Not used if read_ids are given')
parser.add_argument(
    '--read_ids', nargs='+', default=[],
    help='One or more read_ids. If not present, plots the first NREADS in the file')

if __name__ == "__main__":
    args = parser.parse_args()
    print("Opening ", args.mapped_read_file)
    with mapped_signal_files.HDF5(args.mapped_read_file, "r") as h5:
        all_read_ids = h5.get_read_ids()
        print("First ten read_ids in file:")
        for read_id in all_read_ids[:10]:
            print("    ", read_id)
        if len(args.read_ids) > 0:
            read_ids = args.read_ids
        else:
            read_ids = all_read_ids[:args.nreads]
            print("Plotting first ", args.nreads, "read ids in file")
        plt.figure(figsize=(12, 10))
        for nread, read_id in enumerate(read_ids):
            print("Opening read id ", read_id)
            r = h5.get_read(read_id)
            mapping = r['Ref_to_signal']
            f = mapping >= 0
    def test_HDF5_mapped_read_file(self):
        """Test that we can save a mapped read file, open it again and
        use some methods to get data from it. Plot a picture for diagnostics.
        """

        print("Creating Read object from test data")
        read_dict = construct_mapped_read()
        read_object = mapped_signal_files.Read(read_dict)
        print("Checking contents")
        check_text = read_object.check()
        print("Check result on read object:")
        print(check_text)
        self.assertEqual(check_text, "pass")

        print("Writing to file")
        with mapped_signal_files.HDF5(self.testfilepath, "w") as f:
            f.write_read(read_object['read_id'], read_object)
            f.write_version_number(7)

        print("Current dir = ", os.getcwd())
        print("File written to ", self.testfilepath)

        print("\nOpening file for reading")
        with mapped_signal_files.HDF5(self.testfilepath, "r") as f:
            ids = f.get_read_ids()
            print("Read ids=", ids[0])
            print("Version number = ", f.get_version_number())
            self.assertEqual(ids[0], read_dict['read_id'])

            file_test_report = f.check()
            print("Test report:", file_test_report)
            self.assertEqual(file_test_report, "pass")

            read_list = f.get_multiple_reads("all")

        # Only one read was written, so the round-tripped read is element 0.
        recovered_read = read_list[0]
        reflen = len(recovered_read['Reference'])
        siglen = len(recovered_read['Dacs'])

        # Get a chunk - note that chunkstart is relative to the start of the mapped
        # region, not relative to the start of the signal
        chunklen, chunkstart = 5, 3
        chunkdict = recovered_read.get_chunk_with_sample_length(
            chunklen, chunkstart)

        # Check that the extracted chunk is the right length
        self.assertEqual(len(chunkdict['current']), chunklen)

        # Check that the mapping data agrees with what we put in
        self.assertTrue(
            np.all(
                recovered_read['Ref_to_signal'] == read_dict['Ref_to_signal']))

        # Plot a picture showing ref_to_sig from the read object,
        # and the result of searches to find the inverse.
        # Disabled by default; flip the condition to True to regenerate the
        # diagnostic plot at self.plotfilepath.
        if False:
            plt.figure()
            plt.xlabel('Signal coord')
            plt.ylabel('Ref coord')
            # First and last samples of the chunk, to mark its limits
            ix = np.array([0, -1])
            plt.scatter(chunkdict['current'][ix],
                        chunkdict['sequence'][ix],
                        s=50,
                        label='chunk limits',
                        marker='s',
                        color='black')
            # NOTE(review): Ref_to_signal appears to have reflen + 1 entries
            # (one per reference position plus an end boundary) - confirm
            # against the mapped_signal_files format definition.
            plt.scatter(recovered_read['Ref_to_signal'],
                        np.arange(reflen + 1),
                        label='reftosig (source data)',
                        color='none',
                        edgecolor='blue',
                        s=60)
            siglocs = np.arange(siglen, dtype=np.int32)
            sigtoref_fromsearch = recovered_read.get_reference_locations(
                siglocs)
            plt.scatter(siglocs,
                        sigtoref_fromsearch,
                        label='from search',
                        color='red',
                        marker='x',
                        s=50)
            plt.legend()
            plt.grid()
            plt.savefig(self.plotfilepath)
            print("Saved plot to", self.plotfilepath)

        #raise Exception("Fail so we can read output")
        return
Exemplo n.º 7
0
    log.write('* Loading data from {}\n'.format(args.input))
    log.write('* Per read file MD5 {}\n'.format(helpers.file_md5(args.input)))

    if args.input_strand_list is not None:
        read_ids = list(set(helpers.get_read_ids(args.input_strand_list)))
        log.write(
            '* Will train from a subset of {} strands, determined by read_ids in input strand list\n'
            .format(len(read_ids)))
    else:
        log.write('* Will train from all strands\n')
        read_ids = 'all'

    if args.limit is not None:
        log.write('* Limiting number of strands to {}\n'.format(args.limit))

    with mapped_signal_files.HDF5(args.input, "r") as per_read_file:
        read_data = per_read_file.get_multiple_reads(read_ids,
                                                     max_reads=args.limit)
        # read_data now contains a list of reads
        # (each an instance of the Read class defined in mapped_signal_files.py, based on dict)

    log.write('* Loaded {} reads.\n'.format(len(read_data)))

    # Get parameters for filtering by sampling a subset of the reads
    # Result is a tuple median mean_dwell, mad mean_dwell
    # Choose a chunk length in the middle of the range for this
    sampling_chunk_len = (args.chunk_len_min + args.chunk_len_max) // 2
    filter_parameters = chunk_selection.sample_filter_parameters(
        read_data,
        args.sample_nreads_before_filtering,
        sampling_chunk_len,