Example #1
    def _initialize(self, reset=False):

        if not reset:

            try:

                print("Initializing VariantHDF5 ...")

                print("\tReading {} ...".format(self._variant_hdf5_file_path))

                self._variant_hdf5 = open_file(self._variant_hdf5_file_path,
                                               mode="r")

                print("\tReading {} ...".format(
                    self._id_chrom_pickle_gz_file_path))

                with gzip_open(self._id_chrom_pickle_gz_file_path
                               ) as id_chrom_pickle_gz_file:

                    self._id_chrom = load(id_chrom_pickle_gz_file)

                print("\tReading {} ...".format(
                    self._gene_chrom_pickle_gz_file_path))

                with gzip_open(self._gene_chrom_pickle_gz_file_path
                               ) as gene_chrom_pickle_gz_file:

                    self._gene_chrom = load(gene_chrom_pickle_gz_file)

            except (OSError, FileNotFoundError, HDF5ExtError) as exception:

                warn("\tFailed: {}.".format(exception))

                reset = True

        if reset:

            print("Resetting ...")

            if self._variant_hdf5:

                self._variant_hdf5.close()

                print("\tClosed {} ...".format(self._variant_hdf5_file_path))

            print("\tMaking {} ...".format(self._variant_hdf5_file_path))

            self._make_variant_hdf5()

            print("\tReading {} ...".format(self._variant_hdf5_file_path))

            self._variant_hdf5 = open_file(self._variant_hdf5_file_path,
                                           mode="r")
Example #2
def corpus_reader(
        path: str,
        from_memory: bool = False
) -> Union[Tuple[str, None], Tuple[None, str]]:
    """Function to read corpus text file.
    
    Args:
      path: Path to the file.
      from_memory: To "read" from memory
    
    Returns:
      Corpus text and error string in case of any.
    
    Raises:
      IOError: Occurred on reading/unpacking error.
    """
    try:
        if from_memory:
            return path.read(), None
        else:
            if path.endswith(".gz"):
                with gzip_open(path, 'rb') as f:
                    return f.read(), None
            else:
                with open(path, 'r', encoding='utf-8') as f:
                    return f.read(), None
    except IOError as ex:
        return None, ex
Example #3
def gunzip(source_filepath,
           dest_filepath=None,
           block_size=65536,
           remove_source=False,
           stdout_file=None):
    if not dest_filepath:
        # str.strip('.gz') strips any of the characters 'g', 'z', '.' from
        # both ends (e.g. 'log.gz' -> 'lo'); slice the suffix off instead
        dest_filepath = source_filepath[:-3] if source_filepath.endswith('.gz') else source_filepath
    if os.path.isdir(dest_filepath):
        file_name = source_filepath.split(SPLITTER)[-1].replace('.gz', '')
        dest_filepath = add_slash(dest_filepath) + file_name
    print('Gunzipping',
          source_filepath,
          'to',
          dest_filepath,
          flush=True,
          file=stdout_file)
    with gzip_open(source_filepath, 'rb') as s_file, \
            open(dest_filepath, 'wb') as d_file:
        while True:
            block = s_file.read(block_size)
            if not block:
                break
            d_file.write(block)
    if remove_source:
        os.remove(source_filepath)
Example #4
 def walk(self, file_name):
     if file_name.endswith('.gz'):
         with gzip_open(file_name) as current_file:
             yield self._get_submit_from_xml(current_file)
     else:
         with open(file_name, encoding='utf-8') as current_file:
             yield self._get_submit_from_xml(current_file)
Example #5
def read_file(input_file_path):
    if input_file_path.endswith('.gz'):
        infile = gzip_open(input_file_path, 'rt')
    else:
        infile = open(input_file_path, 'r')

    count = 0
    total_records = 0
    chunk = StringIO()
    for line in infile:
        parsed = None
        if len(UNIPROT_RECORD_TERMINATOR.findall(line)) == 0:
            chunk.write(line)
        else:
            count += 1
            if count >= PRINT_LIMIT:
                total_records += count
                count = 0
                logging.info("Processed " + str(total_records))

            try:
                parsed = parse_chunk(chunk)
            except Exception as e:
                # log and skip records that fail to parse
                logging.warning("Failed to parse chunk: %s", e)

            chunk = StringIO()
            if parsed is None:
                continue

            yield parsed

    infile.close()
Example #6
def fetch_and_archive(service, email, archive_path, mid_list):

    logger.info(
        'fetch_and_archive started. email: %s, archive_path: %s, mid_list: %d message(s)' %
        (email, archive_path, len(mid_list))
    )

    if path_isabs(archive_path):
        output_dir = realpath(archive_path)
    else:
        output_dir = realpath(expanduser(path_join(getcwd(), archive_path)))

    count = 0
    error = 0

    for mid in mid_list:

        file_name = path_join(output_dir, ('%x.gz' % mid))
        message = fetch_mail(service, email, mid)

        if not message:
            error += 1
            continue

        with gzip_open(file_name, 'wb') as f:
            f.write(urlsafe_b64decode(message['raw']))
            logger.debug('Message id %x gzipped to %s.' % (mid, file_name))

        count += 1

    logger.info('fetch_and_archive completed. Total %d item(s) saved. Error %d item(s).' % (count, error))
Example #7
def load_from_pickle_gz(file):
    # stream successive pickled objects, closing the gzip handle on exit
    with gzip_open(file, "rb") as fh:
        while True:
            try:
                yield pickle.load(fh)
            except (EOFError, pickle.UnpicklingError):
                return
Example #8
def read_file(input_file_path):
    if input_file_path.endswith('.gz'):
        infile = gzip_open(input_file_path, 'rt')
    else:
        infile = open(input_file_path, 'r')

    count = 0
    total_records = 0
    chunk = StringIO()
    for line in infile:
        parsed = None
        if len(UNIPROT_RECORD_TERMINATOR.findall(line)) == 0:
            chunk.write(line)
        else:
            count += 1
            if count >= PRINT_LIMIT:
                total_records += count
                count = 0
                logging.info("Processed " + str(total_records))

            try:
                parsed = parse_chunk(chunk)
            except Exception as e:
                # log and skip records that fail to parse
                logging.warning("Failed to parse chunk: %s", e)

            chunk = StringIO()
            if parsed is None:
                continue

            yield parsed

    infile.close()
Example #9
 def _load_index_json(self, app_cache):
     index_json_gz_filename = os.path.join(app_cache.get_cache_dir(),
                                           '.index.json.gz')
     self._verify_file(index_json_gz_filename)
     with gzip_open(index_json_gz_filename, 'rb') as fgzip:
         content = fgzip.read()
         return loads(content)
Example #10
def read_file(input_file_path):
    if input_file_path.endswith('.gz'):
        infile = gzip_open(input_file_path, 'rt')
    else:
        infile = open(input_file_path, 'r')

    count = 0
    total_records = 0

    csv_reader = reader(infile, delimiter='\t')

    for line in csv_reader:
        count += 1
        if count >= PRINT_LIMIT:
            total_records += count
            count = 0
            logging.info("Processed " + str(total_records))

        item = {}
        for index, field in enumerate(FIELDNAMES):
            item[field] = line[index]

        item['start'] = int(item['start'])
        item['end'] = int(item['end'])
        yield item

    infile.close()
Example #11
def merge_contents(filelist):
    """
    Merges a list of Contents files and returns a dict of the merged files
    """
    pkgs = {}
    for i in filelist:
        if i and isfile(i):
            with gzip_open(i) as gz:
                cfile = gz.read().decode('utf-8')
            contents = cfile.split('\n')

            header = False
            for line in contents:
                if line.startswith('This file maps each file'):
                    header = True
                if line.startswith('FILE'):
                    header = False
                    continue

                if line != '' and not header:
                    sin = line.split()
                    if sin[-1] not in pkgs:
                        pkgs[sin[-1]] = []
                    pkgs[sin[-1]].append(' '.join(sin[:-1]))
    return pkgs
Example #12
    def _extract_memory_info(self, dump_pathname, processor_notes):
        """Extract and return the JSON data from the .json.gz memory report.
        file"""
        def error_out(error_message):
            processor_notes.append(error_message)
            return {"ERROR": error_message}

        try:
            fd = gzip_open(dump_pathname, "rb")
        except IOError as x:
            error_message = "error in gzip for %s: %r" % (dump_pathname, x)
            return error_out(error_message)

        try:
            memory_info_as_string = fd.read()
            if len(memory_info_as_string) > self.config.max_size_uncompressed:
                error_message = (
                    "Uncompressed memory info too large %d (max: %d)" % (
                        len(memory_info_as_string),
                        self.config.max_size_uncompressed,
                    )
                )
                return error_out(error_message)

            memory_info = json_loads(memory_info_as_string)
        except IOError as x:
            error_message = "error in gzip for %s: %r" % (dump_pathname, x)
            return error_out(error_message)
        except ValueError as x:
            error_message = "error in json for %s: %r" % (dump_pathname, x)
            return error_out(error_message)
        finally:
            fd.close()

        return memory_info
Example #13
    def __init__(self, db, filename, compressed = False, index_offset_bits = 32):
        """
        
        Arguments:
        - `filename`: the filename of .idx file of stardict.
        - `compressed`: indicate whether the .idx file is compressed.
        - `index_offset_bits`: the offset field length in bits.
        """
        self.db = db
        self._offset = 0

        s = self.db["_word_idx"].select().limit(1)
        rs = s.execute()
        result = rs.fetchone()

        if result is None :
            self._size = getsize(filename)
            if compressed:
                self.fh = gzip_open(filename, "rb")
            else:
                self.fh = open(filename, "rb")

            self._index = 0
            self._index_offset_bits = index_offset_bits
            #self.db["_word_idx"] = OOBTree()
            #self.db["_index_idx"] = OOBTree()
            trans = self.db["conn"].begin()
            for word_str, word_data_offset, word_data_size, index in self:
                #self.db["_index_idx"][self._index - 1] = (word_str, word_data_offset, word_data_size)

                i = self.db["_index_idx"].insert().values(idx = self._index - 1,
                          word_str = word_str.decode("utf-8"),
                          word_data_offset = word_data_offset,
                          word_data_size = word_data_size)

                self.db["conn"].execute(i)
                #if word_str not in self.db["_word_idx"]:
                #    self.db["_word_idx"][word_str] = []
                #self.db["_word_idx"][word_str].append(self._index - 1)
                s = self.db["_word_idx"].select().where(self.db["_word_idx"].c.word_str == word_str.decode("utf-8"))
                rs = s.execute()
                result = rs.fetchone()
                if result is None :
                    i = self.db["_word_idx"].insert().values(word_str = word_str.decode("utf-8"), idx = str([]))
                    self.db["conn"].execute(i)
                    rs = s.execute()
                    result = rs.fetchone()

                newlist = eval(result[1])
                newlist.append(self._index - 1)
                j = self.db["_word_idx"].update().values(idx = str(newlist)).where(self.db["_word_idx"].c.word_str == word_str.decode("utf-8"))

                self.db["conn"].execute(j)

            trans.commit()
            del self._index_offset_bits

            mdebug("There were " + str(self._offset) + " total words.")
Example #14
def full_io(url, path, remove=True):
    path = UnsyncFetch.fetch_file(semaphore, 'get', dict(url=url), path,
                                  1).result()
    with gzip_open(path, 'rt') as handle:
        mmcif_dict = MMCIF2DictPlus(handle, ('_pdbe_chain_remapping.', ))
    if remove:
        path.unlink()
    return mmcif_dict
Example #15
def main():

    # parse the command line

    requireEof = True
    markEndOfFile = False
    filenames = []

    for arg in argv[1:]:
        if arg in ["--noendmark", "--noeof", "--nomark"]:  # (unadvertised)
            requireEof = False
        elif arg in ["--markend", "--markeof"]:
            requireEof = False
            markEndOfFile = True
        else:
            filenames += [arg]

    if filenames == []:
        usage("you have to give me at least one file")

    # copy the files;  note that we don't bother (or care) to verify that they
    # are really output from ncrf

    for (ix, filename) in enumerate(filenames):
        if ix > 0:
            print()

        eofMarkerSeen = False

        if filename.endswith(".gz") or filename.endswith(".gzip"):
            f = gzip_open(filename, "rt")
        else:
            f = open(filename, "rt")

        for line in f:
            line = line.rstrip("\n")
            if eofMarkerSeen and line != "":
                exit(
                    "%s: \"%s\" contains additional stuff after end marker (starting with \"%s\")"
                    % (os_path.basename(argv[0]), filename, line[:10]))
            if line == "# ncrf end-of-file":
                eofMarkerSeen = True
                markEndOfFile = True
                continue
            if not eofMarkerSeen:
                try:
                    print(line)
                except IOError as ex:
                    # "Broken pipe" can happen when downstream tools reject
                    # our output as their input
                    if ex.errno == EPIPE:
                        exit("%s: [Errno %d] Broken pipe" %
                             (os_path.basename(argv[0]), ex.errno))

        f.close()

        if requireEof and not eofMarkerSeen:
            exit("%s: \"%s\" may have been truncated (end marker is absent)" %
                 (os_path.basename(argv[0]), filename))
Example #16
def _from_dbs_and_cache(fn, ds):
    print('hitting DBS %s for %s' % (ds, fn))
    from JMTucker.Tools.DBS import files_in_dataset
    files = files_in_dataset(ds, instance='phys03' if ds.endswith('/USER') else 'global')
    with gzip_open(fn, 'wt') as f:
        for file in files:
            f.write(file)
            f.write('\n')
    return files
Example #17
def _read_gzip_file(fn):
    print('getting minbias file list from cache', fn)
    files = []
    with gzip_open(fn, 'rt') as f:
        for line in f:
            line = line.strip()
            if line:
                files.append(line)
    return files
Example #18
def _read_gzip_file(fn):
    print('getting minbias file list from cache', fn)
    files = []
    with gzip_open(fn, 'rt') as f:
        for line in f:
            line = line.strip()
            if line:
                files.append(line)
    return files
Example #19
def _from_dbs_and_cache(fn, ds):
    print('hitting DBS %s for %s' % (ds, fn))
    from JMTucker.Tools.DBS import files_in_dataset
    files = files_in_dataset(ds, instance='phys03' if ds.endswith('/USER') else 'global')
    with gzip_open(fn, 'wt') as f:
        for file in files:
            f.write(file)
            f.write('\n')
    return files
Example #20
def trans_gz(gz_file: str, out_dir: str):
    if gz_file.split('.')[-1] == 'gz':
        with gzip_open(gz_file) as fr:
            with open(os.path.join(out_dir, 'prs_data'), 'w') as fw:
                for line in fr:
                    fw.write(line.decode())
        return os.path.join(out_dir, 'prs_data')
    else:
        return gz_file
Example #21
    def __init__(self, filename=JSONL_FILENAME):
        """Setup file for reading.

        Parameters
        ----------
        filename : str
            Filename for JSONL file with CVR data.

        """
        self.filename = filename
        if filename.endswith('.gz'):
            self.fid = gzip_open(filename, mode='rt')
        else:
            try:
                self.fid = open(filename)
            except IOError:
                self.fid = gzip_open(filename + '.gz', mode='rt')
        self.line_number = 0
Example #22
 def _extract_memory_info(self, dump_pathname, processor_notes):
     """Extract and return the JSON data from the .json.gz memory report.
     file"""
     try:
         fd = gzip_open(dump_pathname, "rb")
     except IOError, x:
         error_message = "error in gzip for %s: %r" % (dump_pathname, x)
         processor_notes.append(error_message)
         return {"ERROR": error_message}
Example #23
 def _extract_memory_info(self, dump_pathname, processor_notes):
     """Extract and return the JSON data from the .json.gz memory report.
     file"""
     try:
         fd = gzip_open(dump_pathname, "rb")
     except IOError, x:
         error_message = "error in gzip for %s: %r" % (dump_pathname, x)
         processor_notes.append(error_message)
         return {"ERROR": error_message}
Example #24
 def __init__(self, filename):
     """
     Parameters
     ----------
     filename: str
          full path input file name
     """
     super().__init__(filename)
     self.file_object = gzip_open(filename, 'rb')
Example #25
def dump_gps_map(gps_map, pickle_gz_file_path):

    if not pickle_gz_file_path.endswith(".pickle.gz"):

        pickle_gz_file_path += ".pickle.gz"

    with gzip_open(pickle_gz_file_path, mode="wb") as pickle_gz_file:

        dump(gps_map, pickle_gz_file)
Example #26
 def __init__(self, filename):
     """
     Parameters
     ----------
     filename: str
          full path input file name
     """
     super().__init__(filename)
     self.file_object = gzip_open(filename, 'rb')
Example #27
    def __init__(self, file):
        tmp = load(gzip_open(file, 'rb'))
        self.__models = tmp['models']
        self.__conf = tmp['config']
        self.__workpath = '.'

        self.Nlim = self.__conf.get('nlim', 1)
        self.TOL = self.__conf.get('tol', 1e10)
        self.__units = self.__conf.get('report_units')
        self.__show_structures = self.__conf.get('show_structures')
Example #28
    def __save(self):
        stashdump = self.__calc_stashdump()
        if stashdump is not None:
            with gzip_open(self.__fname, 'wb') as f:
                f.write(stashdump)

        if len(self.__properties) and self.__properties_changed:
            with self.__stash_lock:
                with open(self.__pname, 'w') as f:
                    json.dump(self.__properties, f)
Example #29
    def cache_data(self, hash, file, data, url=True, gzip=True):
        if gzip and self.gzip:
            with gzip_open(self.get_path(hash, file + '.gz'), 'wb') as f:
                f.write(data)

        with open(self.get_path(hash, file), 'wb') as f:
            f.write(data)

        if url:
            return self.get_url(hash, file)
Example #30
def save_message(mid, message, store_path):
    """
    :type  mid:        int   message id
    :type  message:    str   raw MIME message
    :param store_path: str   path to store
    :return:
    """
    path = path_join(store_path, '%x.gz' % mid)
    with gzip_open(path, 'wb') as f:
        f.write(message)
Example #31
def load_message(mid, store_path):
    """
    :type mid:         int    message id
    :type store_path:  str    path to load
    :return:
    """
    path = path_join(store_path, '%x.gz' % mid)
    with gzip_open(path, 'rb') as f:
        message = f.read()
    return message
Example #32
 def store_cache_to_file(self, _attr_name, data, gzip=False):
     filename = self.cache_dir + '/' + _attr_name + '.pickle'
     pickle_obj = pickle_dumps(data)
     if gzip:
         filename += '.gz'
         with gzip_open(filename, "wb") as f:
             f.write(pickle_obj)
     else:
         with open(filename, "wb") as f:
             f.write(pickle_obj)
Example #33
def step04( datadir ):

  uvalues_by_dim = {};


  with gzip_open( datadir+"/train_trn.tsv.gz", "rt" ) as f:

    firstline = f.readline();
    if firstline and firstline[-1] == '\n':
      firstline = firstline[:-1];
    firstline = firstline.split( '\t' );

    assert \
         firstline \
      == (   [ '"id"', '"y"', '"cId"' ]
           + [ '"x{}"'.format(i) for i in range(1,101) ] );

    for line in f:

      if line and line[-1] == '\n':
        line = line[:-1];
      line = line.split( '\t' );

      id_ = line[0];
      y = line[1];
      cid = line[2];
      x = [ None ] + line[3:];

      for dim in range(1,101):

        if not dim in uvalues_by_dim:
          uvalues_by_dim[ dim ] = set();
        if len( uvalues_by_dim[dim] ) < 500:
          uvalues_by_dim[ dim ].add( x[dim] );

  dim_by_uvalues = [];
  binary = [];
  for ( dim, uvalues ) in uvalues_by_dim.items():
    if len( uvalues ) < 5:
      uvalues_ = uvalues;
    else:
      uvalues_ = None;
    if len( uvalues ) == 2:
      binary.append( dim );
    dim_by_uvalues.append( ( len(uvalues), dim, uvalues_ ) );


  with open( datadir+"/step04.txt", "wt" ) as out:

    for ( uvalues, dim, uvalues_ ) in sorted( dim_by_uvalues ):
      print( "{:3d} {:7d} {:s}".format( dim, uvalues, repr(uvalues_) ) );
      print( "{:3d} {:7d} {:s}".format( dim, uvalues, repr(uvalues_) ), file=out );

    print( "-->", repr(binary) );
    print( "-->", repr(binary), file=out );
Example #34
def gzip_file(source_path, archive_path):
    """
    Create a gzip compressed archive of ``source_path`` at ``archive_path``.
    An empty archive file will be created if the source file does not exist.
    This gives the diagnostic archive a consistent set of files which can
    easily be tested.
    """
    with gzip_open(archive_path, 'wb') as archive:
        if os.path.isfile(source_path):
            with open(source_path, 'rb') as source:
                copyfileobj(source, archive)
Example #35
 def test_downsample_16bit_image(self):
     with pushd_popd(tempdir=True) as tempdir:
         with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in:
             with open('16bit.tif', 'wb') as tif_out:
                 tif_out.write(gzip_in.read())
         ws = self.resolver.workspace_from_nothing(directory=tempdir)
         ws.add_file('IMG', ID='foo', url='16bit.tif', mimetype='image/tiff', pageId=None)
         pil_before = Image.open('16bit.tif')
         assert pil_before.mode == 'I;16'
         pil_after = ws._resolve_image_as_pil('16bit.tif')
         assert pil_after.mode == 'L'
Example #36
def load_packages_file(filename):
    """
    Load a gzip'd packages file.
    Returns a dictionary of package name and package key-values.
    """
    # TODO: should we skip files like this if they don't exist?
    if filename is not None and isfile(filename):
        with gzip_open(filename) as f:
            packages_contents = f.read().decode('utf-8')
        return parse_packages(packages_contents)

    return None
Example #37
    def open(self, filename):
        """
        Open specified file for writing.  File will be compressed
        if the gzip flag of the constructor was set to True.

        Args:
            filename (str): path to file to open for writing
        """
        if self.gzip:
            self.file = gzip_open(filename, 'wb')
        else:
            self.file = open(filename, 'wb')
Example #38
	def _uncompress_archive(self, app_cache, local_archive):
		try:
			with gzip_open(local_archive) as zipped_file:
				archive_content = zipped_file.read()
				with open(os.path.join(app_cache.get_cache_dir(), '.all.tar'), 'wb') as extracted_file:
					extracted_file.write(archive_content)
		except (zlib.error, EnvironmentError) as exc:
			self.warn('Error while reading %s: %s' % (local_archive, exc))
			return False
		else:
			self._extract_archive(app_cache)
			return True
Example #39
	def _load_index_json(self, app_cache):
		index_json_gz_filename = os.path.join(app_cache.get_cache_dir(), '.index.json.gz')
		if not ucr_is_false('appcenter/index/verify'):
			detached_sig_path = index_json_gz_filename + '.gpg'
			(rc, gpg_error) = gpg_verify(index_json_gz_filename, detached_sig_path)
			if rc:
				if gpg_error:
					self.fatal(gpg_error)
				raise Abort('Signature verification for %s failed' % index_json_gz_filename)
		with gzip_open(index_json_gz_filename, 'rb') as fgzip:
			content = fgzip.read()
			return loads(content)
Example #40
    def open(self, filename):
        """
        Open specified file for writing.  File will be compressed
        if the gzip flag of the constructor was set to True.

        Args:
            filename (str): path to file to open for writing
        """
        if self.gzip:
            self.file = gzip_open(filename, 'wb')
        else:
            self.file = open(filename, 'wb')
Example #41
def get_archive(mid, archive_path):

    if path_isabs(archive_path):
        archive_dir = realpath(archive_path)
    else:
        archive_dir = realpath(expanduser(path_join(getcwd(), archive_path)))

    path = path_join(archive_dir, '%x.gz' % mid)

    with gzip_open(path, 'rb') as f:
        mime = f.read()

    logger.debug('Archive \'%s\' extracted successfully. %d bytes' % (path, len(mime)))

    return mime
Example #42
    def __init__(self, filename, dict_ifo, dict_index, compressed = False):
        """Constructor.
        
        Arguments:
        - `filename`: filename of .dict file.
        - `dict_ifo`: IfoFileReader object.
        - `dict_index`: IdxFileReader object.
        """
        self._dict_ifo = dict_ifo
        self._dict_index = dict_index
        self._compressed = compressed
        self._offset = 0

        if self._compressed:
            self.fh = gzip_open(filename, "rb")
        else:
            self.fh = open(filename, "rb")
Example #43
def biom_open(fp, permission='U'):
    """Wrapper to allow opening of gzipped or non-compressed files
    
    Read or write the contents of a file

    file_fp : file path
    permission : either 'r','w','a'

    If the file is binary, be sure to pass in a binary mode (append 'b' to
    the mode); opening a binary file in text mode (e.g., in default mode 'U')
    will have unpredictable results.
    
    This code was copied from QIIME (www.qiime.org).
    """
    if is_gzip(fp):
        return gzip_open(fp,'rb')
    else:
        return open(fp, permission)
Example #44
def biom_open(fp, permission='U'):
    """Wrapper to allow opening of gzipped or non-compressed files

    Read or write the contents of a file

    file_fp : file path
    permission : either 'r','w','a'

    If the file is binary, be sure to pass in a binary mode (append 'b' to
    the mode); opening a binary file in text mode (e.g., in default mode 'U')
    will have unpredictable results.

    This function is ported from QIIME (http://www.qiime.org), previously named
    qiime_open. QIIME is a GPL project, but we obtained permission from the
    authors of this function to port it to the BIOM Format project (and keep it
    under BIOM's BSD license).
    """
    if is_gzip(fp):
        return gzip_open(fp, 'rb')
    else:
        return open(fp, permission)
Example #45
 def __init__(self, outfile, mode='x'):
     """
     Parameters
     ----------
     outfile:  str
         full path output file name
     mode: str
         'w'	open for writing, truncating the file first
         'x'	open for exclusive creation, failing if the file already exists
         'a'	open for writing, appending to the end of the file if it exists
     Raises
     ------
     FileNotFoundError: When the file cannot be opened
     FileExistsError: when infile exist and mode is x
     """
     super().__init__(outfile)
     mode += 'b'
     try:
         self.file_object = gzip_open(outfile, mode)
     except FileExistsError:
         raise FileExistsError('file exists: {} and mode is {}'.
                               format(outfile, mode))
Example #46
def da_read( fn ):

  assert isfile( fn );

  with gzip_open( fn, "rt" ) as f:

    firstline = f.readline();

    if firstline and firstline[-1] == '\n':
      firstline = firstline[:-1];
    firstline = firstline.split( '\t' );

    has_y = None;

    if firstline[:3] == [ '"id"', '"y"', '"cId"' ]:
      has_y = True;
    elif firstline[:3] == [ '"id"', '"cId"', '"x1"' ]:    
      has_y = False;
    else:
      assert False;

    if has_y:
      assert \
           firstline \
        == (   [ '"id"', '"y"', '"cId"' ]
             + [ '"x{}"'.format(i) for i in range(1,101) ] );
    else:
      assert \
           firstline \
        == (   [ '"id"', '"cId"' ]
             + [ '"x{}"'.format(i) for i in range(1,101) ] );

    x_check = {};

    for line in f:

      if line and line[-1] == '\n':
        line = line[:-1];
      line = line.split( '\t' );

      id_ = line[0];
      id_ = int( id_ );

      if has_y:
        y = line[1];
        assert y in [ "0", "1" ];
        y = int( y );
        rest = line[2:];
      else:
        rest = line[1:];

      c = rest[0];

      assert c[0] == '"';
      assert c[-1] == '"';
      c = int( c[1:-1] );

      b = [];
      x = [];

      for i in range( 1, len(rest) ):

        try:

          val = rest[i];

          if i in BINARY_FEATs:

            assert val in [ "0", "1" ];
            val = int(val)
            b.append( val );
            continue;

          if not '.' in val:
            val = val+'.';
          val = val.split( '.' );

          assert \
            ( ( val[0][0] == '-' ) and ( len(val[0]) == 2 ) ) \
                  or ( ( val[0][0] != '-' ) and ( len(val[0]) == 1 ) );
          assert \
            len( val[1] ) <= 3;

          while len( val[1] ) < 3:
            val[1] = val[1] + '0';

          assert \
            len( val[1] ) == 3;

          if val[0][0] == '-':
            val = - int( val[0][1:] ) * 1000 - int( val[1] );
          else:
            val = int( val[0] ) * 1000 + int( val[1] );          

          assert ( float(val) / 1000.0 ) == float(rest[i]);

          x_check_ = x_check.get( i, set() );
          if len( x_check_ ) < 3:
            x_check_.add( val );
            x_check[ i ] = x_check_;

          x.append( val );

        except:

          print( repr(val), rest[i] );
          raise;

      if has_y:
        yield ( id_, y, [c], b, x );
      else:
        yield ( id_, None, [c], b, x );

    for v in x_check.values():
      assert len( v ) > 2;
Example #47
def step13( datadir ):

  with gzip_open( datadir+"/train.tsv.gz", "rt" ) as f:

    firstline = f.readline();

    if firstline and firstline[-1] == '\n':
      firstline = firstline[:-1];
    firstline = firstline.split( '\t' );

    assert \
         firstline \
      == (   [ '"id"', '"y"', '"cId"' ]
           + [ '"x{}"'.format(i) for i in range(1,101) ] );

    pos_valsx = [];
    neg_valsx = [];
    pos_rndx = [];
    neg_rndx = [];

    pos_valsx_ = [];
    neg_valsx_ = [];
    pos_rndx_ = [];
    neg_rndx_ = [];

    i = 1;
    for line in f:

      i += 1;
      if i > 10000:
        break;

      line_ = line;

      if line and line[-1] == '\n':
        line = line[:-1];
      line = line.split( '\t' );

      id_ = line[0];
      y = line[1];
      cid = line[2];
      x = [ None ];
      assert cid.startswith( '"' );
      assert cid.endswith( '"' );
      cid = int( cid[1:-1] );

      for x_ in line[3:]:
        x.append( float(x_) )

      relevant_x = x[cid];
      random_x = choice( x[1:] );

      if y == '0':

        neg_valsx.append(relevant_x);
        if relevant_x not in [ 0.0, 1.0 ]:
          neg_valsx_.append(relevant_x);

        neg_rndx.append(random_x);
        if random_x not in [ 0.0, 1.0 ]:
          neg_rndx_.append(random_x);

      elif y == '1':

        pos_valsx.append(relevant_x);
        if relevant_x not in [ 0.0, 1.0 ]:
          pos_valsx_.append(relevant_x);

        pos_rndx.append(random_x);
        if random_x not in [ 0.0, 1.0 ]:
          pos_rndx_.append(random_x);

  ( fig, ax ) = plt.subplots( nrows=2, ncols=2, figsize=(6,6) );
  ax[0,0].hist( [ neg_valsx, pos_valsx ], 100, histtype='step', color='br', linewidth=3 );  
  ax[0,1].hist( [ neg_rndx, pos_rndx ], 100, histtype='step', color='br', linewidth=3 );  
  ax[1,0].hist( [ neg_valsx_, pos_valsx_ ], 100, histtype='step', color='br', linewidth=3 );  
  ax[1,1].hist( [ neg_rndx_, pos_rndx_ ], 100, histtype='step', color='br', linewidth=3 );  
  fig.savefig( datadir+'/step13.png' );
Example #48
    def __init__(self, thrift_type, filename,
                 postprocess=None, filetype=FileType.AUTO):
        """
        Args:
            thrift_type: Class for Thrift type, e.g. Communication, TokenLattice
            filename (str):
            postprocess (function): A post-processing function that is called
                with the Thrift object as argument each time a Thrift object
                is read from the file
            filetype (FileType): Expected type of file.  Default value is
                `FileType.AUTO`, where function will try to automatically
                determine file type.

        Raises:
            ValueError: if filetype is not a known filetype name or id
        """
        filetype = FileType.lookup(filetype)

        self._seek_supported = True

        self._thrift_type = thrift_type
        if postprocess is None:
            def _noop(obj):
                return
            self._postprocess = _noop
        else:
            self._postprocess = postprocess
        self._source_filename = filename

        if filetype == FileType.TAR:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|')

        elif filetype == FileType.TAR_GZ:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|gz')

        elif filetype == FileType.TAR_BZ2:
            self.filetype = 'tar'
            self.tar = tarfile.open(filename, 'r|bz2')

        elif filetype == FileType.ZIP:
            self.filetype = 'zip'
            self.zip = zipfile.ZipFile(filename, 'r')
            self.zip_infolist = self.zip.infolist()
            self.zip_infolist_index = 0

        elif filetype == FileType.STREAM:
            self.filetype = 'stream'
            f = open(filename, 'rb')

        elif filetype == FileType.STREAM_GZ:
            self.filetype = 'stream'
            f = gzip_open(filename, 'rb')

        elif filetype == FileType.STREAM_BZ2:
            self.filetype = 'stream'
            f = bz2.BZ2File(filename, 'r')

        elif filetype == FileType.AUTO:
            if tarfile.is_tarfile(filename):
                self.filetype = 'tar'
                self.tar = tarfile.open(filename, 'r|*')

            elif zipfile.is_zipfile(filename):
                self.filetype = 'zip'
                self.zip = zipfile.ZipFile(filename, 'r')
                self.zip_infolist = self.zip.infolist()
                self.zip_infolist_index = 0

            elif mimetypes.guess_type(filename)[1] == 'gzip':
                # this is not a true stream---is_tarfile will have
                # successfully seeked backwards on the file if we have
                # reached this point
                self.filetype = 'stream'
                f = gzip_open(filename, 'rb')

            elif mimetypes.guess_type(filename)[1] == 'bzip2':
                # this is not a true stream
                self.filetype = 'stream'
                f = bz2.BZ2File(filename, 'r')

            else:
                # this is not a true stream
                self.filetype = 'stream'
                f = open(filename, 'rb')

        else:
            raise ValueError('unknown filetype %d' % filetype)

        if self.filetype == 'stream':
            self.transport = TTransport.TFileObjectTransport(f)
            self.protocol = factory.createProtocol(self.transport)
            self.transport.open()
Example #49
            n = getattr(new, name)
            print('runs in old not in new:')
            print(sorted(o - n))
            l = sorted(x for x in n - o if x < 157460)
            print('runs in new not in old before 2011: #', len(l), 'min', l[0] if len(l) else None, 'max', l[-1] if len(l) else None)
            l = sorted(x for x in n - o if x >= 157460)
            print('runs in new not in old during 2011: #', len(l), 'min', l[0] if len(l) else None, 'max', l[-1] if len(l) else None)
            print()
        sys.exit(0)
    elif 'make_lists' in sys.argv:
        from datetime import datetime
        from gzip import open as gzip_open
        from MuonAnalysis.Cosmics.runregistry import RunRegistryHelper

        epoch = min_time = datetime(2010, 2, 1)
        rrh = RunRegistryHelper(gzip_open('download.xml.gz')) # Get this from the run registry Table->Get Data->Generate... then Table->Get Data->Export->XML (all).

        kinds = [
            ('cosmics', ['Cosmic10', 'Cosmics10', 'Cosmics11']),
            ('commissioning', ['BeamCommissioning10', 'BeamCommisioning10', 'Commissioning', 'Commissioining10', 'Commisioning10', 'Commissioning10', 'Commissioning11']), # "commissioning" is hard to spell
            ('collisions', ['Collisions10', 'PostCollisions10', 'Collisions11']),
            ]

        for kind_label, kind_groups in kinds:
            for det in ['dt', 'csc', 'strip', 'pix', 'rpc']:
                runs = rrh.get_good_runs([det.upper()], min_time, kind_groups)
                print('%s_runs_%s = set(%s)' % (kind_label, det, repr(runs).replace(' ', '')))
            print()
    elif 'dump_trigger_menus' in sys.argv:
        from gzip import open as gzip_open
        from MuonAnalysis.Cosmics.runregistry import RunRegistryHelper
Example #50
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)

    parser = hmmer_args(parser)

    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)

    ARGS = parse_args(parser, args, namespace=ns)

    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            model = pickle_load(fh)
            if model[0] != MODEL_VERSION:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)

    # create a temporary file wherein space characters have been removed
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        fd, tmphmm = mkstemp()
        close(fd)
        tmpaln = None
        try:
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
                # explicitly gc hmm
                hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            if exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    X = extractor.transform(alignment)
    y = clf.predict(X)

    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    idlen = max(len(r.id) for r in alignment) + 3

    print('{{\n  "label": "{0:s}",\n  "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # replace the last comma with a space
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        print(
            '    {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n  ]\n}', file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return 0
Example #51
 def opener(fp, mode):
     return codecs.getwriter('utf-8')(gzip_open(fp, mode))
Example #52
def step05( datadir ):

  stats_by_b = {};
  stats_by_cid_b = {};

  with gzip_open( datadir+"/train_trn.tsv.gz", "rt" ) as f:

    firstline = f.readline();
    if firstline and firstline[-1] == '\n':
      firstline = firstline[:-1];
    firstline = firstline.split( '\t' );

    assert \
         firstline \
      == (   [ '"id"', '"y"', '"cId"' ]
           + [ '"x{}"'.format(i) for i in range(1,101) ] );

    for line in f:

      if line and line[-1] == '\n':
        line = line[:-1];
      line = line.split( '\t' );

      id_ = line[0];
      y = line[1];
      cid = line[2];

      assert cid[0] == '"';
      assert cid[-1] == '"';
      cid = int( cid[1:-1] );

      x = [ None ];
      b = [];

      for i in range( 3, len(line) ):
        if (i-2) in BINARY_FEATs:
          b.append( line[i] );
        else:
          x.append( line[i] );

      b_ = 0;
      for i in range( 0, len(b) ):
        if b[i] == '0':
          b_i = 0;
        elif b[i] == '1':
          b_i = 1;
        else:
          assert False;
        b_ |= b_i << i;

      (total,pos) = stats_by_b.get( b_, (0,0) );
      total += 1;
      if y == '1':
        pos += 1;        
      stats_by_b[ b_ ] = ( total, pos );

      (total,pos) = stats_by_cid_b.get( (cid,b_), (0,0) );
      total += 1;
      if y == '1':
        pos += 1;        
      stats_by_cid_b[ (cid,b_) ] = ( total, pos );


  with open( datadir+"/step05.txt", "wt" ) as out:

    for b in sorted( stats_by_b ):

      ( total, pos ) = stats_by_b[ b ];
      p = float(pos) / float(total);

      # print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(b), pos, total, p ) );
      print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(b), pos, total, p ), file=out );

    print( "-->", len(stats_by_b) );

    for (cid,b) in sorted( stats_by_cid_b ):

      ( total, pos ) = stats_by_cid_b[ (cid,b) ];
      p = float(pos) / float(total);

      # print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(cid)+'.'+hex(b), pos, total, p ) );
      print( "{:20s};{:7d};{:7d};{:1.4f}".format( hex(cid)+'.'+hex(b), pos, total, p ), file=out );

    print( "-->", len(stats_by_cid_b) );
Example #53
 def GzipType(string):
     try:
         return gzip_open(string, 'wb')
     except OSError:
         # raise (not return) so argparse reports it as a usage error
         raise ArgumentTypeError("cannot open '{0:s}' for writing".format(string))
Example #54
#!/usr/bin/env python

from sys import argv
from gzip import open as gzip_open
from biom.parse import parse_biom_table

if __name__ == '__main__':
    table = parse_biom_table(gzip_open(argv[1]))
    foo = table.transformSamples(lambda x, y, z: x)
Example #55
            # use the ratings in the newest created dataset
            datasets.sort(key=lambda ds: self.parse_timestamp(ds.find('CREATE_TIME').text))
            ds = datasets[-1]
            cmps = ds.find('CMPS').findall('CMP')

            if all(self.is_good(cmps, subdet) for subdet in subdets):
                good.append(run_number)

        good.sort()
        return good

if __name__ == '__main__':
    from gzip import open as gzip_open
    min_time = datetime(2010, 2, 1)
    rrh = RunRegistryHelper(gzip_open('download.xml.gz'))
    dt_st = rrh.get_good_runs(['DT', 'STRIP'], min_time)
    dt_px_st = rrh.get_good_runs(['DT', 'PIX', 'STRIP'], min_time)

    # "Histogram" of GROUP_NAMEs used.
    from collections import defaultdict
    from pprint import pprint
    d = defaultdict(list)
    for run in rrh.runs:
        d[rrh.group_name(run)].append((rrh.run_number(run), rrh.start_time(run)))
    to_show = 8
    for k in sorted(d.keys()):
        v = d[k]
        v.sort()
        print(k)
        if len(v) > to_show: