def test_add_data(self):
     """
     Check that ``add_data`` records the given identifier in the writer index.

     The same payload is added twice: once with an integer id through the
     writer already stored on ``self.Writer`` and once with a string id
     through a second writer built from the uncompressed example file path.
     """
     test_string = b"AAAAAAAAbbbbbbbbCCCCCCCCdddddddd"

     # Integer identifier.
     self.Writer.add_data(test_string, 1)
     self.Writer.close()
     # Smoke check only: the file must exist and be readable. The context
     # manager closes the handle even when the read raises.
     with open(self.paths[0], "rb") as handle:
         handle.read()
     self.assertIn(1, self.Writer.index)

     # String identifier, written through a freshly created writer.
     test_file = os.path.join(
         os.path.dirname(__file__),
         "data",
         "uncompressed",
         "CF_07062012_pH8_2_3A.mzML",
     )
     self.Writer = GSGW(test_file,
                        max_idx=80,
                        max_idx_len=8,
                        max_offset_len=8)
     self.Writer.add_data(test_string, "a")
     self.Writer.close()
     with open(self.paths[0], "rb") as handle:
         handle.read()
     self.assertIn("a", self.Writer.index)
 def setUp(self):
     """
     Build the writer under test and remember where its output is read from.

     ``self.paths`` is resolved relative to this file, whereas the writer's
     ``output_path`` is resolved relative to the current working directory.
     """
     here = os.path.dirname(__file__)
     fixture = os.path.join(here, "data", "unittest.mzml")
     self.paths = [fixture]

     target = os.path.abspath(
         os.path.join(".", "tests", "data", "unittest.mzml"))
     writer_options = dict(
         max_idx=80,
         max_idx_len=8,
         max_offset_len=8,
         output_path=target,
     )
     self.Writer = GSGW(fixture, **writer_options)
 def setUp(self):
     """
     Prepare a fresh ``GSGW`` writer for every test.

     The read-back path stored in ``self.paths`` is anchored at this file's
     directory; the writer's ``output_path`` is anchored at the current
     working directory.
     """
     file_name = "unittest.mzml"
     read_back = os.path.join(os.path.dirname(__file__), "data", file_name)
     self.paths = [read_back]

     relative_output = os.path.join(".", "tests", "data", file_name)
     self.Writer = GSGW(
         self.paths[0],
         max_idx=80,
         max_idx_len=8,
         max_offset_len=8,
         output_path=os.path.abspath(relative_output),
     )
Пример #4
0
 def setUp(self):
     """
     Create the ``GSGW`` instance exercised by the tests.

     ``self.paths[0]`` (relative to this file) is the location the tests
     read from; ``output_path`` (relative to the working directory) is the
     location handed to the writer.
     """
     module_dir = os.path.dirname(__file__)
     self.paths = [os.path.join(module_dir, 'data', 'unittest.mzml')]

     output_parts = ('.', 'tests', 'data', 'unittest.mzml')
     output_file = os.path.abspath(os.path.join(*output_parts))

     self.Writer = GSGW(
         self.paths[0],
         max_idx=80,
         max_idx_len=8,
         max_offset_len=8,
         output_path=output_file,
     )
Пример #5
0
 def test_add_data(self):
     """
     Verify that ``add_data`` stores the supplied id in ``Writer.index``.

     Runs once with an integer id on the existing ``self.Writer`` and once
     with a string id on a writer created from the uncompressed example
     file path.
     """
     test_string = b'AAAAAAAAbbbbbbbbCCCCCCCCdddddddd'

     # First pass: integer id.
     self.Writer.add_data(test_string, 1)
     self.Writer.close()
     # Only checks that the file can be opened and read; ``with`` guarantees
     # the handle is released even if reading fails.
     with open(self.paths[0], 'rb') as handle:
         handle.read()
     self.assertIn(1, self.Writer.index)

     # Second pass: string id on a new writer.
     test_file = os.path.join(
         os.path.dirname(__file__),
         'data',
         'uncompressed',
         'CF_07062012_pH8_2_3A.mzML',
     )
     self.Writer = GSGW(test_file, max_idx=80, max_idx_len=8, max_offset_len=8)
     self.Writer.add_data(test_string, 'a')
     self.Writer.close()
     with open(self.paths[0], 'rb') as handle:
         handle.read()
     self.assertIn('a', self.Writer.index)
 def test_add_data(self):
     """
     Ensure ids passed to ``add_data`` end up as keys of ``Writer.index``.

     Covers an integer id (existing ``self.Writer``) and a string id (a new
     writer constructed from the uncompressed example file path).
     """
     test_string = b"AAAAAAAAbbbbbbbbCCCCCCCCdddddddd"

     self.Writer.add_data(test_string, 1)
     self.Writer.close()
     # The content is not inspected; opening and reading merely proves the
     # file is there. The context manager closes the handle on any outcome.
     with open(self.paths[0], "rb") as handle:
         handle.read()
     self.assertIn(1, self.Writer.index)

     test_file = os.path.join(
         os.path.dirname(__file__),
         "data",
         "uncompressed",
         "CF_07062012_pH8_2_3A.mzML",
     )
     self.Writer = GSGW(test_file, max_idx=80, max_idx_len=8, max_offset_len=8)
     self.Writer.add_data(test_string, "a")
     self.Writer.close()
     with open(self.paths[0], "rb") as handle:
         handle.read()
     self.assertIn("a", self.Writer.index)
Пример #7
0
def index_gzip(
    pathIn,
    pathOut,
    max_idx=10000,
    idx_len=8,
    verbose=False,
    comp_str=-1
):
    """
    Convert an mzml file (can be gzipped) into an indexed, gzipped mzML file.

    The input is read line by line and cut into chunks that are handed to
    the writer via ``add_data``:

    * everything up to and including the ``<spectrumL...`` line -> ``'Head'``
    * each ``<spectrum ...>`` ... ``</spectrum>`` -> the integer spectrum id
    * everything up to and including the ``<chromatogramL...`` line -> ``'junk'``
    * each ``<chromatogram ...>`` ... ``</chromatogram>`` -> the chromatogram id
    * whatever remains after the last closing tag -> ``'tail'``

    Arguments:
        pathIn (str): path to an mzML input File. Must end with ``gz``
            (opened with gzip) or ``mzml`` (opened as plain text); the
            check is case-insensitive.
        pathOut (str): path were the index gzip will be created.

    Keyword Arguments:
        max_idx (int): number of indexes which can be saved.
        idx_len (int): character len of on key; also used as the offset
            length.
        verbose (boolean): print progress while parsing input.
        comp_str(int): compression strength of zlib compression,
            needs to  be 1 <= x <= 9 (the default of -1 is passed on
            unchanged; presumably zlib's default level - confirm against
            GSGW).

    Raises:
        ValueError: if ``pathIn`` ends with neither ``gz`` nor ``mzml``.
            Raised before the output file is created.
    """
    lowered = pathIn.lower()
    if lowered.endswith('gz'):
        fileOpen = gzip.open
    elif lowered.endswith('mzml'):
        fileOpen = open
    else:
        # Previously this fell through and crashed later with an unbound
        # ``fileOpen`` after the output file had already been created.
        raise ValueError(
            "Unsupported input file '{0}': expected a path ending in "
            "'mzml' or 'gz'".format(pathIn)
        )

    with GSGW(
        output_path=pathOut,
        max_idx=max_idx,
        max_idx_len=idx_len,
        max_offset_len=idx_len,
        comp_str=comp_str
    ) as Writer:
        with fileOpen(pathIn, 'rt') as Reader:
            # Lines of the chunk currently being collected; joined once per
            # chunk instead of growing a string line by line.
            chunk = []
            for line in Reader:
                tag = line.strip()
                # Every line belongs to the current chunk, whatever it is.
                chunk.append(line)
                if tag.startswith('</spectrum>'):
                    Writer.add_data(''.join(chunk), nativeID)
                    if verbose:
                        print('NativeID : {0}'.format(nativeID), end='\r')
                    chunk = []
                elif tag.startswith('<spectrum '):
                    lineID = re.search(
                        regex_patterns.SPECTRUM_TAG_PATTERN, line
                    ).group('index')

                    nativeID = int(regex_patterns.SPECTRUM_ID_PATTERN.search(
                        lineID
                    ).group(0))
                elif tag.startswith('<chromatogram '):
                    nativeID = re.search(
                        regex_patterns.CHROMATOGRAM_ID_PATTERN, line
                    ).group(1)
                    # Progress output only on request, like every other
                    # message in this function.
                    if verbose:
                        print('found chromatogram')
                elif tag.startswith('<spectrumL'):
                    Writer.add_data(''.join(chunk), 'Head')
                    if verbose:
                        print('NativeID :', 'Head')
                    chunk = []
                elif tag.startswith('<chromatogramL'):
                    Writer.add_data(''.join(chunk), 'junk')
                    if verbose:
                        print('NativeID :', 'junk')
                    chunk = []
                elif tag.startswith('</chromatogram>'):
                    Writer.add_data(''.join(chunk), nativeID)
                    if verbose:
                        print('found chromatogram')
                        print('NativeID: {0}'.format(nativeID))
                    chunk = []
            if chunk:
                Writer.add_data(''.join(chunk), 'tail')
                if verbose:
                    print('NativeID :', 'tail')
        Writer.write_index()
class GSGWTest(unittest.TestCase):
    """
    Tests for the ``GSGW`` writer.

    Each test drives ``self.Writer`` and then inspects either the raw bytes
    found at ``self.paths[0]`` or the writer's in-memory ``index``.

    TODO: Write messages for assertions
    """

    def setUp(self):
        """Create a fresh writer and remember the path its output is read from."""
        # ``self.paths`` is resolved relative to this file, ``output_path``
        # relative to the current working directory.
        self.paths = [os.path.join(os.path.dirname(__file__), "data", "unittest.mzml")]
        self.Writer = GSGW(
            self.paths[0],
            max_idx=80,
            max_idx_len=8,
            max_offset_len=8,
            output_path=os.path.abspath(
                os.path.join(".", "tests", "data", "unittest.mzml")
            ),
        )

    def tearDown(self):
        """Close the writer and delete the file written during the test."""
        self.Writer.close()
        os.remove(os.path.abspath(os.path.join(".", "tests", "data", "unittest.mzml")))

    def _read_output(self):
        """Return the raw bytes at ``self.paths[0]``; the handle is always closed."""
        with open(self.paths[0], "rb") as handle:
            return handle.read()

    def test_init(self):
        """A new writer starts with a zero checksum and the expected file name."""
        self.assertEqual(self.Writer.crc32, 0)
        self.assertEqual(
            self.Writer.file_name,
            os.path.abspath(
                os.path.join(
                    "tests",
                    "data",
                    "unittest.mzml",
                )
            ),
        )

    def test_write_gen_header_index(self):
        """The header written with ``Index=True`` announces and starts the index."""
        self.Writer._write_gen_header(Index=True)
        self.Writer.close()
        header = self._read_output()

        # Normal stuff
        self.assertEqual(header[:2], b"\x1f\x8b")  # Gzip magic bytes
        self.assertEqual(header[2], 8)  # comp
        self.assertEqual(header[3], 16)  # comment flags set
        # self.assertEqual(header[5:9], b'') # time, how to check?
        self.assertEqual(header[8], 2)  # xfl
        self.assertEqual(header[9], 3)  # os

        # index block directly after the fixed 10 byte header
        self.assertEqual(header[10:12], b"FU")  # magic index bytes
        self.assertEqual(header[12], 1)  # version

    def test_write_gen_header_no_index(self):
        """The header written with ``Index=False`` is exactly 10 bytes, no flags."""
        self.Writer._write_gen_header(Index=False)
        self.Writer.close()
        header = self._read_output()

        # Normal stuff
        self.assertEqual(header[:2], b"\x1f\x8b")  # Gzip magic bytes
        self.assertEqual(header[2], 8)  # comp
        self.assertEqual(header[3], 0)  # no flags set
        # self.assertEqual(header[5:9], b'') # time, how to check?
        self.assertEqual(header[8], 2)  # xfl
        self.assertEqual(header[9], 3)  # os
        self.assertEqual(len(header), 10)

    def test_allocate_index_bytes(self):
        """Index allocation reserves one slot per entry plus one extra byte."""
        self.Writer._allocate_index_bytes()
        self.Writer.close()
        data = self._read_output()

        # Same values the writer was configured with in setUp.
        max_idx_num = 80
        max_idx = 8
        max_offset = 8
        self.assertEqual(len(data), ((max_idx + max_offset) * max_idx_num) + 1)

    def test_write_data(self):
        """``_write_data`` emits a raw deflate stream followed by CRC32 and size."""
        test_string = b"AAAAAAAAbbbbbbbbCCCCCCCC"
        self.Writer._write_data(test_string)
        self.Writer.close()
        data = self._read_output()

        # Negative wbits: decode a raw deflate stream without any header.
        Decomp = zlib.decompressobj(-zlib.MAX_WBITS)
        # The last 8 bytes are two little-endian unsigned 32 bit integers.
        compData = data[:-8]
        crc = struct.unpack("<L", data[-8:-4])[0]
        isize = struct.unpack("<L", data[-4:])[0]
        self.assertEqual(Decomp.decompress(compData), test_string)
        self.assertEqual(crc, zlib.crc32(test_string))
        self.assertEqual(isize, len(test_string) % 2 ** 32)

    def test_add_data(self):
        """``add_data`` records integer as well as string ids in the index."""
        test_string = b"AAAAAAAAbbbbbbbbCCCCCCCCdddddddd"

        # Integer identifier on the writer created in setUp.
        self.Writer.add_data(test_string, 1)
        self.Writer.close()
        # Smoke check only: the file must exist and be readable.
        self._read_output()
        self.assertIn(1, self.Writer.index)

        # String identifier on a writer built from the example file path.
        test_file = os.path.join(
            os.path.dirname(__file__),
            "data",
            "uncompressed",
            "CF_07062012_pH8_2_3A.mzML",
        )
        self.Writer = GSGW(test_file, max_idx=80, max_idx_len=8, max_offset_len=8)
        self.Writer.add_data(test_string, "a")
        self.Writer.close()
        self._read_output()
        self.assertIn("a", self.Writer.index)

    def test_write_index(self):
        """``write_index`` stores the id of the added chunk in the first slot."""
        test_string = b"AAAAAAAAbbbbbbbbCCCCCCCCdddddddd"

        # with int as id
        # TODO assertions
        self.Writer.add_data(test_string, 1)
        self.Writer.write_index()
        self.Writer.close()
        data = self._read_output()

        # Skip the header (first 15 bytes), then take
        # idx_num * (idx_len + offset_len) bytes plus the zero termination.
        index = data[15 : 15 + (80 * 16) + 1]
        identifier = index[:8].decode("latin-1")
        offset = index[8:16].decode("latin-1")
        # 0xAC characters are stripped from both fields
        # (presumably padding - confirm against GSGW).
        self.assertEqual(identifier.strip("\xAC"), "1")
        self.assertIsInstance(offset.strip("\xAC"), str)
Пример #9
0
def index(pathIn,
          pathOut,
          max_idx=10000,
          idx_len=8,
          verbose=False,
          comp_str=-1):
    """
    Convert an mzml file (can be gzipped) into an indexed, gzipped mzML file.

    The input is read line by line and cut into chunks that are handed to
    the writer via ``add_data``: the part up to the ``<spectrumL...`` line
    (``"Head"``), every spectrum (integer id), the part up to the
    ``<chromatogramL...`` line (``"junk"``), every chromatogram (its id) and
    whatever is left at the end (``"tail"``).

    Arguments:
        pathIn (str): path to input File. Whether it is gzipped is decided
            from its first two bytes, not from its name.
        pathOut (str): path were output should be created.

    Keyword Arguments:
        max_idx (int): number of indexes which can be saved.
        idx_len (int): character len of on key; also used as the offset
            length.
        verbose (boolean): print progress while parsing input. Reported are
            the head, chromatogram and tail chunks; spectra are not
            reported.
        comp_str(int): compression strength of zlib compression,
            needs to  be 1 <= x <= 9 (the default of -1 is passed on
            unchanged; presumably zlib's default level - confirm against
            GSGW).
    """
    import gzip

    # The docstring promises that plain mzML works as well, so look at the
    # gzip magic bytes instead of always assuming a gzipped input.
    with open(pathIn, "rb") as probe:
        is_gzipped = probe.read(2) == b"\x1f\x8b"
    open_input = gzip.open if is_gzipped else open

    with GSGW(
            output_path=pathOut,
            # Previously dropped, so the writer silently used its own
            # default capacity regardless of what the caller asked for.
            max_idx=max_idx,
            max_idx_len=idx_len,
            max_offset_len=idx_len,
            comp_str=comp_str,
    ) as Writer:
        with open_input(pathIn, "rt") as Reader:
            # Lines of the chunk currently being collected; joined once per
            # chunk instead of growing a string line by line.
            chunk = []
            for line in Reader:
                tag = line.strip()
                # Every line belongs to the current chunk, whatever it is.
                chunk.append(line)
                if tag.startswith("</spectrum>"):
                    Writer.add_data("".join(chunk), nativeID)
                    chunk = []
                elif tag.startswith("<spectrum "):
                    lineID = re.search(regex_patterns.SPECTRUM_TAG_PATTERN,
                                       line).group("index")
                    nativeID = int(
                        regex_patterns.SPECTRUM_ID_PATTERN.search(
                            lineID).group(0))
                elif tag.startswith("<chromatogram "):
                    nativeID = re.search(
                        regex_patterns.CHROMATOGRAM_ID_PATTERN, line).group(1)
                elif tag.startswith("<spectrumL"):
                    Writer.add_data("".join(chunk), "Head")
                    if verbose:
                        print("NativeID :", "Head")
                    chunk = []
                elif tag.startswith("<chromatogramL"):
                    Writer.add_data("".join(chunk), "junk")
                    if verbose:
                        print("NativeID :", "junk")
                    chunk = []
                elif tag.startswith("</chromatogram>"):
                    Writer.add_data("".join(chunk), nativeID)
                    if verbose:
                        print("found chromo")
                        print("NativeID :", nativeID, end="\r")
                    chunk = []
            if chunk:
                Writer.add_data("".join(chunk), "tail")
                if verbose:
                    print("NativeID :", "tail")
        Writer.write_index()
Пример #10
0
class GSGWTest(unittest.TestCase):
    '''
    Tests for the ``GSGW`` writer.

    Every test writes through ``self.Writer`` and afterwards checks either
    the raw bytes found at ``self.paths[0]`` or the writer's ``index``.

    TODO: Write messages for assertions
    '''
    def setUp(self):
        """Create a fresh writer and remember the path its output is read from."""
        # ``self.paths`` is resolved relative to this file, ``output_path``
        # relative to the current working directory.
        self.paths = [
            os.path.join(
                os.path.dirname(__file__),
                'data',
                'unittest.mzml'
            )
        ]
        self.Writer = GSGW(
            self.paths[0],
            max_idx=80,
            max_idx_len=8,
            max_offset_len=8,
            output_path=os.path.abspath(
                os.path.join(
                    '.',
                    'tests',
                    'data',
                    'unittest.mzml'
                )
            )
        )

    def tearDown(self):
        """Close the writer and delete the file written during the test."""
        self.Writer.close()
        os.remove(os.path.abspath(os.path.join('.', 'tests', 'data', 'unittest.mzml')))

    def _read_output(self):
        """Return the raw bytes at ``self.paths[0]``; the handle is always closed."""
        with open(self.paths[0], 'rb') as handle:
            return handle.read()

    def test_init(self):
        """A new writer starts with a zero checksum and the expected file name."""
        self.assertEqual(self.Writer.crc32, 0)
        self.assertEqual(
            self.Writer.file_name,
            os.path.abspath(
                os.path.join(
                    'tests',
                    'data',
                    'unittest.mzml'
                )
            )
        )

    def test_write_gen_header_index(self):
        """The header written with ``Index=True`` announces and starts the index."""
        self.Writer._write_gen_header(Index=True)
        self.Writer.close()
        header = self._read_output()

        # Normal stuff
        self.assertEqual(header[:2], b'\x1f\x8b')  # Gzip magic bytes
        self.assertEqual(header[2], 8)  # comp
        self.assertEqual(header[3], 16)  # comment flags set
        # self.assertEqual(header[5:9], b'') # time, how to check?
        self.assertEqual(header[8], 2)  # xfl
        self.assertEqual(header[9], 3)  # os

        # index block directly after the fixed 10 byte header
        self.assertEqual(header[10:12], b'FU')  # magic index bytes
        self.assertEqual(header[12], 1)  # version

    def test_write_gen_header_no_index(self):
        """The header written with ``Index=False`` is exactly 10 bytes, no flags."""
        self.Writer._write_gen_header(Index=False)
        self.Writer.close()
        header = self._read_output()

        # Normal stuff
        self.assertEqual(header[:2], b'\x1f\x8b')  # Gzip magic bytes
        self.assertEqual(header[2], 8)  # comp
        self.assertEqual(header[3], 0)  # no flags set
        # self.assertEqual(header[5:9], b'') # time, how to check?
        self.assertEqual(header[8], 2)  # xfl
        self.assertEqual(header[9], 3)  # os
        self.assertEqual(len(header), 10)

    def test_allocate_index_bytes(self):
        """Index allocation reserves one slot per entry plus one extra byte."""
        self.Writer._allocate_index_bytes()
        self.Writer.close()
        data = self._read_output()

        # Same values the writer was configured with in setUp.
        max_idx_num = 80
        max_idx = 8
        max_offset = 8
        self.assertEqual(len(data), ((max_idx + max_offset) * max_idx_num) + 1)

    def test_write_data(self):
        """``_write_data`` emits a raw deflate stream followed by CRC32 and size."""
        test_string = b'AAAAAAAAbbbbbbbbCCCCCCCC'
        self.Writer._write_data(test_string)
        self.Writer.close()
        data = self._read_output()

        # Negative wbits: decode a raw deflate stream without any header.
        Decomp = zlib.decompressobj(-zlib.MAX_WBITS)
        # The last 8 bytes are two little-endian unsigned 32 bit integers.
        compData = data[:-8]
        crc = struct.unpack('<L', data[-8:-4])[0]
        isize = struct.unpack('<L', data[-4:])[0]
        self.assertEqual(Decomp.decompress(compData), test_string)
        self.assertEqual(crc, zlib.crc32(test_string))
        self.assertEqual(isize, len(test_string) % 2**32)

    def test_add_data(self):
        """``add_data`` records integer as well as string ids in the index."""
        test_string = b'AAAAAAAAbbbbbbbbCCCCCCCCdddddddd'

        # Integer identifier on the writer created in setUp.
        self.Writer.add_data(test_string, 1)
        self.Writer.close()
        # Smoke check only: the file must exist and be readable.
        self._read_output()
        self.assertIn(1, self.Writer.index)

        # String identifier on a writer built from the example file path.
        test_file = os.path.join(
            os.path.dirname(__file__),
            'data',
            'uncompressed',
            'CF_07062012_pH8_2_3A.mzML',
        )
        self.Writer = GSGW(test_file, max_idx=80, max_idx_len=8, max_offset_len=8)
        self.Writer.add_data(test_string, 'a')
        self.Writer.close()
        self._read_output()
        self.assertIn('a', self.Writer.index)

    def test_write_index(self):
        """``write_index`` stores the id of the added chunk in the first slot."""
        test_string = b'AAAAAAAAbbbbbbbbCCCCCCCCdddddddd'

        # with int as id
        # TODO assertions
        self.Writer.add_data(test_string, 1)
        self.Writer.write_index()
        self.Writer.close()
        data = self._read_output()

        # Skip the header (first 15 bytes), then take
        # idx_num * (idx_len + offset_len) bytes plus the zero termination.
        index = data[15:15+(80*16)+1]
        identifier = index[:8].decode('latin-1')
        offset = index[8:16].decode('latin-1')
        # 0xAC characters are stripped from both fields
        # (presumably padding - confirm against GSGW).
        self.assertEqual(identifier.strip('\xAC'), '1')
        self.assertIsInstance(offset.strip('\xAC'), str)