Пример #1
0
def gzopen_without_timestamps(name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open a gzip-compressed tar archive without stamping the current time into it.

    Replacement for the stock tarfile gzip opener: the stock opener does not
    forward extra arguments to the GzipFile constructor, so this one builds the
    GzipFile itself with ``mtime=0`` (instead of ``None``). That keeps
    ``time.time()`` out of the gzip header, which would otherwise make the md5
    of the produced file change from run to run.

    :param name: archive path, handed to both GzipFile and TarFile.taropen
    :param mode: ``"r"`` or ``"w"``
    :param fileobj: optional already-open file object to wrap
    :param compresslevel: gzip compression level
    :param kwargs: forwarded untouched to ``tarfile.TarFile.taropen``
    :return: the opened TarFile, with ``_extfileobj`` set to ``False``
    :raises ValueError: if ``mode`` is neither ``"r"`` nor ``"w"``
    :raises CompressionError: if the gzip module cannot be used
    :raises ReadError: if the data read in ``"r"`` mode is not gzip data
    """
    from tarfile import CompressionError, ReadError

    if mode != "r" and mode != "w":
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip_file_cls = gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    reading = mode == 'r'

    try:
        gz_stream = gzip_file_cls(name, mode, compresslevel, fileobj, mtime=0)
    except OSError:
        if reading and fileobj is not None:
            raise ReadError("not a gzip file")
        raise

    try:
        archive = tarfile.TarFile.taropen(name, mode, gz_stream, **kwargs)
    except BaseException as error:
        # Whatever went wrong, release the gzip stream before propagating.
        gz_stream.close()
        if reading and isinstance(error, IOError):
            raise ReadError("not a gzip file")
        raise

    archive._extfileobj = False
    return archive
Пример #2
0
def gzopen_without_timestamps(name,
                              mode="r",
                              fileobj=None,
                              compresslevel=None,
                              **kwargs):
    """Open a gzip-compressed tar archive without stamping the current time into it.

    Replacement for the stock tarfile gzip opener: the stock opener does not
    forward extra arguments to the GzipFile constructor, so this one builds the
    GzipFile itself with ``mtime=0`` (instead of ``None``). That keeps
    ``time.time()`` out of the gzip header, which would otherwise make the md5
    of the produced file change from run to run.

    :param name: archive path, handed to both GzipFile and TarFile.taropen
    :param mode: ``"r"`` or ``"w"``
    :param fileobj: optional already-open file object to wrap
    :param compresslevel: gzip compression level; when ``None`` the value of
        the ``CONAN_COMPRESSION_LEVEL`` environment variable is used, or 9 if
        that is unset. An explicit ``0`` (no compression) is honoured.
    :param kwargs: forwarded to ``tarfile.TarFile.taropen``
    :return: the opened TarFile, with ``_extfileobj`` set to ``False``
    :raises ValueError: if ``mode`` is neither ``"r"`` nor ``"w"``
    :raises CompressionError: if the gzip module cannot be used
    :raises ReadError: if the data read in ``"r"`` mode is not gzip data
    """
    from tarfile import CompressionError, ReadError

    # Test against None rather than truthiness: level 0 is a valid explicit
    # choice and must not be replaced by the default.
    if compresslevel is None:
        compresslevel = int(os.getenv("CONAN_COMPRESSION_LEVEL", 9))

    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    try:
        fileobj = gzip.GzipFile(name, mode, compresslevel, fileobj, mtime=0)
    except OSError:
        if fileobj is not None and mode == 'r':
            raise ReadError("not a gzip file")
        raise

    try:
        # Format is forced because in Python3.8, it changed and it generates different tarfiles
        # with different checksums, which break hashes of tgzs
        t = tarfile.TarFile.taropen(name,
                                    mode,
                                    fileobj,
                                    format=tarfile.GNU_FORMAT,
                                    **kwargs)
    except IOError:
        fileobj.close()
        if mode == 'r':
            raise ReadError("not a gzip file")
        raise
    except Exception:
        fileobj.close()
        raise
    t._extfileobj = False
    return t
Пример #3
0
    def extract_archive(self):
        """Extract the ``.tar.gz`` archive named by ``self.filename``.

        ``extractall()`` is called without a target path, so members are
        unpacked relative to the current working directory.

        Raises:
            ReadError: if ``self.filename`` does not end with ``.tar.gz``
                (``tarfile.open`` may raise it too for unreadable archives).
            FileNotFoundError: if the name ends with ``.tar.gz`` but no such
                regular file exists.
        """
        archive_name = str(self.filename)
        if not archive_name.endswith('.tar.gz'):
            raise ReadError('Not a tar archive')
        if not os.path.isfile(self.filename):
            raise FileNotFoundError(archive_name)

        # The context manager closes the archive even if extraction fails part-way.
        # NOTE(review): members are extracted as-is; confirm archives come from a trusted source.
        with tarfile.open(self.filename) as tar:
            tar.extractall()
Пример #4
0
 def open(cls, name=None, fileobj=None, bufsize=10240, **kwargs):
     '''Open an archive read-only, trying each opener listed in cls.OPEN_METH.

     The write interface is disabled: every opener is called with mode "r".
     The first opener that succeeds provides the return value. When an opener
     raises ReadError or CompressionError, fileobj (if given) is rewound to
     the position it had before that attempt and the next opener is tried.

     name     -- archive name passed through to the opener
     fileobj  -- optional file object passed through to the opener
     bufsize  -- accepted for signature compatibility; not used here
     kwargs   -- forwarded to the opener

     Raises ReadError when no opener could open the archive.
     '''
     for comptype in cls.OPEN_METH:
         func = getattr(cls, cls.OPEN_METH[comptype])
         if fileobj is not None:
             saved_pos = fileobj.tell()
         try:
             return func(name, "r", fileobj, **kwargs)
         except (ReadError, CompressionError):
             # This opener does not fit; restore the stream for the next one.
             if fileobj is not None:
                 fileobj.seek(saved_pos)
     # Reached only after every opener failed. Raising here, outside the
     # loop, keeps the function from silently returning None.
     raise ReadError("file could not be opened successfully.")
Пример #5
0
def _restarting_writer(out_file):
    """Return a ``retrbinary`` callback that restarts ``out_file`` from scratch.

    On the first chunk received, ``out_file`` is rewound and truncated so bytes
    left by an earlier, interrupted transfer are not followed by a second copy.
    Nothing is discarded until replacement data actually arrives.
    """
    restarted = []

    def write_chunk(data):
        if not restarted:
            out_file.seek(0)
            out_file.truncate()
            restarted.append(True)
        out_file.write(data)

    return write_chunk


def geo_download(geo_id, series_path, geo_platforms, clean=True, decompress=True):
    """Downloads the IDATs and metadata for a GEO series

    Arguments:
        geo_id [required]
            the GEO Accession for the desired series (e.g. GSE134293)
        series_path [required]
            the directory to download the data to (it must already exist)
        geo_platforms [required]
            the list of supported GEO platforms; a sub-directory of series_path
            is created for each one that does not exist yet
        clean
            whether or not to delete files once they are no longer needed (True by default)
        decompress
            whether or not to gunzip the extracted *.idat.gz files (True by default)

    Note about GEO IDs:
        You can use the NIH online search to find data sets, then click "Send to:" at the bottom of a results page,
        and export a list of unique IDs as text file. These IDs are not GEO_IDs used here. First, remove the first
        three digits from the number, so Series ID: 200134293 is GEO accession ID: 134293, then include the GSE part,
        like "GSE134293" in your CLI parameters.

    Returns:
        False when the RAW archive was unpacked and none of its members has '.idat' in its name;
        True otherwise.

    Raises:
        FileNotFoundError
            if series_path does not exist
        ReadError
            if the RAW archive cannot be read (typically an incomplete download)"""
    success = True
    series_dir = Path(series_path)
    raw_filename = f"{geo_id}_RAW.tar"
    miniml_filename = f"{geo_id}_family.xml"
    miniml_archive = f"{miniml_filename}.tgz"
    miniml_archive_path = f"{series_path}/{miniml_archive}"
    raw_path = f"{series_path}/{raw_filename}"

    if not os.path.exists(series_path):
        raise FileNotFoundError(f'{geo_id} directory not found.')

    for platform in geo_platforms:
        platform_dir = Path(f"{series_path}/{platform}")
        if not platform_dir.exists():
            platform_dir.mkdir()

    # --- MINiML metadata -------------------------------------------------
    # The context manager closes the connection even if the download or the
    # unpacking below raises.
    with FTP('ftp.ncbi.nlm.nih.gov', timeout=120) as ftp: # 2 mins
        ftp.login()
        ftp.cwd(f"geo/series/{geo_id[:-3]}nnn/{geo_id}")

        # Stays None when the size cannot be determined; tqdm then runs without a total.
        filesize = None
        try:
            filesize = ftp.size(f"miniml/{miniml_archive}") # -- gives 550 error because CWD puts it in ASCII mode.
        except Exception as e:
            LOGGER.error(f"ftp.size ERROR: {e}")

        if not Path(f"{series_path}/{miniml_filename}").exists():
            if not Path(miniml_archive_path).exists():
                LOGGER.info(f"Downloading {miniml_filename}")
                with open(miniml_archive_path, 'wb') as miniml_file:
                    try:
                        if filesize is None:
                            # Fall back to the directory listing; the entry to look for is the
                            # .tgz that is actually retrieved below.
                            for filename, filestats in ftp.mlsd(path="miniml", facts=["size"]):
                                if filename == miniml_archive and 'size' in filestats:
                                    filesize = int(filestats['size'])
                                    break
                        with tqdm(unit = 'b', unit_scale = True, leave = False, miniters = 1, desc = geo_id, total = filesize) as tqdm_instance:
                            def tqdm_callback(data):
                                tqdm_instance.update(len(data))
                                miniml_file.write(data)
                            ftp.retrbinary(f"RETR miniml/{miniml_archive}", tqdm_callback)
                    except Exception as e:
                        LOGGER.error(e)
                        LOGGER.info('tqdm: Failed to create a progress bar, but it is downloading...')
                        # Retry without a progress bar, replacing (not appending to) any partial data.
                        ftp.retrbinary(f"RETR miniml/{miniml_archive}", _restarting_writer(miniml_file))
            with tarfile.open(miniml_archive_path) as min_tar:
                for file in min_tar.getnames():
                    if file == miniml_filename:
                        min_tar.extract(file, path=series_path)
            if clean:
                Path(miniml_archive_path).unlink()

    # --- RAW IDAT archive -------------------------------------------------
    # Only fetch/unpack when the series directory holds no IDAT files yet.
    if not list(series_dir.glob('*.idat.gz')) and not list(series_dir.glob('**/*.idat')):
        if not Path(raw_path).exists():
            with FTP('ftp.ncbi.nlm.nih.gov',
                     timeout=59) as ftp:  # see issue https://bugs.python.org/issue30956 (must be <60s because of a bug)
                ftp.login()
                ftp.cwd(f"geo/series/{geo_id[:-3]}nnn/{geo_id}")
                # Ask for the size before creating the local file, so a failed lookup
                # does not leave an empty archive behind.
                filesize = ftp.size(f"suppl/{raw_filename}")
                with open(raw_path, 'wb') as raw_file:
                    try:
                        try:
                            with tqdm(unit = 'b', unit_scale = True, leave = False, miniters = 1, desc = geo_id, total = filesize) as tqdm_instance:
                                def tqdm_callback(data):
                                    tqdm_instance.update(len(data))
                                    raw_file.write(data)
                                ftp.retrbinary(f"RETR suppl/{raw_filename}", tqdm_callback)
                        except Exception:
                            LOGGER.info('tqdm: Failed to create a progress bar, but it is downloading...')
                            # Retry without a progress bar, replacing (not appending to) any partial data.
                            ftp.retrbinary(f"RETR suppl/{raw_filename}", _restarting_writer(raw_file))
                        ftp.quit()
                    except socket.timeout:
                        LOGGER.warning("FTP timeout error.")
                        # seems to happen AFTER download is done, so just ignoring it.
                    LOGGER.info(f"Closing file {raw_filename}")
        LOGGER.info(f"Unpacking {raw_filename}")
        try:
            with tarfile.open(raw_path) as tar:
                members = tar.getmembers()
                # let user know if this lacks idats
                if not any('.idat' in member.name for member in members):
                    file_endings = Counter(tuple(PurePath(member.name).suffixes) for member in members)
                    file_endings = [(k,v) for k,v in file_endings.most_common() if v > 1]
                    LOGGER.warning(f'No idat files found in {raw_filename}. {len(members)} files found: {file_endings}.')
                    success = False
                for member in members:
                    # Literal substring test; the earlier regex left its dots unescaped.
                    if '.idat.gz' in member.name:
                        tar.extract(member, path=series_path)
        except ReadError as e:
            raise ReadError(f"There appears to be an incomplete download of {geo_id}. Please delete those files and run this again.") from e
        if clean:
            os.remove(raw_path)

    if decompress:
        # Snapshot the matches first: the loop creates and deletes files in this directory.
        for gz in list(series_dir.glob("*.idat.gz")):
            gz_string = str(gz)
            # Strip the trailing ".gz" to get the decompressed file name.
            with gzip.open(gz_string, 'rb') as f_in, open(gz_string[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            if clean:
                gz.unlink()
        LOGGER.info(f"Downloaded and unpacked {geo_id} idats")
    else:
        LOGGER.info(f"Downloaded {geo_id} idats without decompressing")
    return success