Пример #1
0
    def get_file_size(self, path, job_name=None, **kwargs):
        """Return the size of a storage path in GB (bytes divided by 2**30).

        A path ending in "*" is measured as a prefix. Any other path is looked
        up as a file first and as a folder second; a path that is neither
        counts as 0 bytes.

        Args:
            path (str): Path to measure.
            job_name: Not used by this method.
            **kwargs: Not used by this method.

        Returns:
            float: The size in GB. For a local path nothing is measured; a
            warning is logged and True is returned instead.

        Raises:
            Any exception raised while measuring is logged and re-raised.
        """
        # Local paths live on the disk image and are never measured
        if self.__get_file_protocol(path) == "Local":
            logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
            return True

        try:
            if path.endswith("*"):
                # Prefix: measure everything that starts with the part before the wildcard
                size_in_bytes = StoragePrefix(path.rstrip("*")).size
            else:
                # Plain path: it may name a file or a folder
                file_obj = StorageFile(path)
                folder_obj = StorageFolder(path)

                if file_obj.exists():
                    size_in_bytes = file_obj.size
                elif folder_obj.exists():
                    size_in_bytes = folder_obj.size
                else:
                    size_in_bytes = 0

            # Bytes -> GB
            return float(size_in_bytes) / 2**30

        except BaseException as error:
            logging.error(f"Unable to get file size: {path}")
            if str(error) != "":
                logging.error(f"Received the following msg:\n{error}")
            raise
Пример #2
0
def transfer_file(to_folder_uri, file_id=None, file_info_href=None):
    """Copy one BaseSpace file into a destination folder.

    The file is identified either by its ID or by the href of its file info;
    when both are given the ID wins. FASTQ files get the BaseSpace file ID
    appended to their name, because one MiSeq run may contain several FASTQ
    files with the same name. The copy is skipped when the destination already
    holds a file of the same size.

    Args:
        to_folder_uri (str): URI of the destination folder.
        file_id: BaseSpace file ID. Defaults to None.
        file_info_href (str): Href of the BaseSpace file info. Defaults to None.

    Returns:
        str: URI of the file at the destination.

    Raises:
        ValueError: If neither file_id nor file_info_href is given.
    """
    if file_id is None and file_info_href is None:
        raise ValueError("Either BaseSpace file_id or file_info_href is needed for file transfer.")

    # Derive whichever of file_id / file_info_href was not supplied
    if file_id is not None:
        file_info_href = "v1pre3/files/%s" % file_id
        file_content_href = "v1pre3/files/%s/content" % file_id
    else:
        file_id = file_info_href.strip("/").split("/")[-1]
        file_content_href = "%s/content" % file_info_href

    file_info = api_response(file_info_href)
    logger.debug("Transferring file from BaseSpace: %s" % file_content_href)

    # Make FASTQ file names unique by embedding the BaseSpace file ID
    fastq_suffix = ".fastq.gz"
    filename = file_info.get("Name")
    if filename.endswith(fastq_suffix):
        filename = filename.replace(fastq_suffix, "_%s.fastq.gz" % file_id)

    to_uri = os.path.join(to_folder_uri, filename)
    dest_file = StorageFile(to_uri)
    file_size = file_info.get("Size")

    # A destination file of the same (non-zero) size is taken as already transferred
    already_transferred = (
        file_size
        and dest_file.exists()
        and dest_file.size
        and dest_file.size == file_info.get("Size")
    )
    if not already_transferred:
        source_uri = build_api_url(file_content_href)
        StorageFile(source_uri).copy(to_uri)
    else:
        logger.debug("File %s exists at destination: %s" % (filename, to_uri))
    return to_uri
Пример #3
0
    def get_intraday_series(self, symbol, date=None):
        """Fetch the intraday series of one day as a pandas data frame.

        Args:
            symbol (str): The name of the equity/stock.
            date (str, optional): Day to fetch, formatted like self.date_fmt
                (e.g. 2017-02-12). Defaults to None.

        Returns: A pandas data frame indexed by 'timestamp', with the symbol
            stored on its `symbol` attribute.
            When date is given, only that day is looked up and the frame may be empty.
            When date is None, the search starts today and steps back one day at a
            time until a day with data is found or 100 days have been tried; the
            frame of the last day tried is returned, so it may still be empty.

        """
        series_type = self.intraday_series_type
        # `date` is overwritten while searching, so remember whether the caller pinned a day
        search_backwards = date is None
        days_back = 0
        while True:
            if search_backwards:
                candidate_day = datetime.datetime.now() - datetime.timedelta(days=days_back)
                date = candidate_day.strftime(self.date_fmt)
            logger.debug("Getting data for %s" % date)

            # next_date is the exclusive upper bound, kept as a string because it is
            # compared against the 'timestamp' column.
            day_after = (datetime.datetime.strptime(date, self.date_fmt).date()
                         + datetime.timedelta(days=1))
            next_date = day_after.strftime(self.date_fmt)

            if self.cache:
                # Prefer the per-day cache file when one exists
                file_path = self.__cache_file_path(symbol, series_type, date)
                storage_file = StorageFile(file_path)
                if storage_file.exists():
                    logger.debug("Reading existing data... %s" % file_path)
                    with storage_file('r') as f:
                        df = pd.read_csv(f,
                                         index_col=0,
                                         parse_dates=['timestamp'])
                else:
                    full_data = self.__intraday_get_full_data(symbol)
                    in_requested_day = ((full_data['timestamp'] >= date)
                                        & (full_data['timestamp'] < next_date))
                    df = full_data[in_requested_day]
            else:
                # No cache configured: always request new data
                full_data = self.__request_data(symbol, series_type, 'full')
                in_requested_day = ((full_data['timestamp'] >= date)
                                    & (full_data['timestamp'] < next_date))
                df = full_data[in_requested_day]

            days_back += 1
            # Stop unless we are searching backwards, found nothing, and have days left to try
            if not search_backwards or not df.empty or days_back >= 100:
                break

        if df is not None:
            df.set_index('timestamp', inplace=True)
        df.symbol = symbol
        return df
Пример #4
0
    def get_file_size(self, path, job_name=None, **kwargs):
        """Return the size of a storage path in GB, waiting for it to appear.

        A path ending in "*" is measured as a prefix. Any other path is looked
        up as a file first and as a folder second; when neither exists the
        lookup is repeated up to 10 times, sleeping 0, 1, 2, ... seconds
        before each trial, and the size is 0 if the path never shows up.

        If measuring fails with an error whose message contains
        "dictionary changed size", the whole call is retried; the number of
        retries already made is carried in kwargs['retry_count'].

        Args:
            path (str): Path to measure.
            job_name: Not used here, only forwarded when the call is retried.
            **kwargs: May carry 'retry_count' (int, defaults to 0).

        Returns:
            float: The size in GB. For a local path nothing is measured; a
            warning is logged and True is returned instead. After 5 failed
            attempts a warning is logged and 0 is returned.

        Raises:
            Any other exception raised while measuring is logged and re-raised.
        """
        max_retries = 5
        max_trials = 10

        retry_count = kwargs.get("retry_count", 0)

        # Local paths live on the disk image and are never measured
        if self.__get_file_protocol(path) == "Local":
            logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
            return True

        if retry_count >= max_retries:
            # retry_count attempts were made before this call; report exactly that many
            logging.warning(f"Failed to get size of '{path}'! Attempted to retrieve size {retry_count} times.")
            return 0

        try:
            if path.endswith("*"):
                # Prefix: measure everything that starts with the part before the wildcard
                _size = StoragePrefix(path.rstrip("*")).size
            else:
                # Plain path: it may name a file or a folder
                _file = StorageFile(path)
                _folder = StorageFolder(path)
                _size = 0

                # Exactly max_trials lookups, so the "Trial N/10" message never exceeds 10/10
                for trial in range(max_trials):
                    # Back off a little longer before every new trial (no wait before the first)
                    time.sleep(trial)

                    if _file.exists():
                        _size = _file.size
                        break
                    if _folder.exists():
                        _size = _folder.size
                        break

                    logging.warning(f"Cannot get size of '{path}' as it does not exist! Trial {trial + 1}/{max_trials}")
                else:
                    logging.error(f"Cannot get size of '{path}' as it doesn't exist after multiple trials!")

            # Bytes -> GB
            return float(_size) / 2**30

        except BaseException as e:
            logging.error(f"Unable to get file size: {path}")
            if str(e) != "":
                logging.error(f"Received the following msg:\n{e}")
            # Only this specific error is retried; the retry count travels through kwargs
            if "dictionary changed size" in str(e):
                kwargs['retry_count'] = retry_count + 1
                return self.get_file_size(path, job_name, **kwargs)
            raise
Пример #5
0
 def test_upload_from_file(self):
     """Uploading a missing local file raises; uploading a real one stores its content."""
     tests_dir = os.path.dirname(__file__)
     gs_file = StorageFile("gs://aries_test/local_upload.txt")

     # A local file that does not exist cannot be uploaded.
     missing_local_path = os.path.join(tests_dir, "abc.txt")
     with self.assertRaises(FileNotFoundError):
         gs_file.upload_from_file(missing_local_path)

     # An existing local file is uploaded and can be read back.
     existing_local_path = os.path.join(tests_dir, "fixtures", "test_file.txt")
     gs_file.upload_from_file(existing_local_path)
     self.assertEqual(gs_file.read(), b'This is a local test file.\n')
     gs_file.delete()
Пример #6
0
    def path_exists(self, path, job_name=None, **kwargs):
        """Tell whether a storage path exists.

        A path ending in "*" is checked as a prefix; any other path exists if
        it is found as a file or as a folder.

        Args:
            path (str): Path to check.
            job_name: Only used in the error message logged on RuntimeError.
            **kwargs: Not used by this method.

        Returns:
            The result of the existence check. True for a local path (which is
            not checked at all) and False when the check raises RuntimeError.

        Raises:
            Any exception other than RuntimeError is logged and re-raised.
        """
        # Local paths live on the disk image and are assumed to be there
        if self.__get_file_protocol(path) == "Local":
            logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
            return True

        try:
            logging.debug(f"Checking existence of {path}...")

            if not path.endswith("*"):
                # Plain path: it may name a file or a folder
                found_as_file = StorageFile(path).exists()
                return found_as_file or StorageFolder(path).exists()

            # Prefix: check the part before the wildcard
            return StoragePrefix(path.rstrip("*")).exists()

        except RuntimeError as error:
            traceback.print_exc()
            if str(error) != "":
                logging.error(f"StorageHelper error for {job_name}:\n{error}")
            return False
        except BaseException:
            traceback.print_exc()
            logging.error(f"Unable to check path existence: {path}")
            raise
Пример #7
0
 def test_text_read(self):
     """Reading "file_in_test_folder" reports zero size, position and content."""
     empty_file_uri = os.path.join(self.TEST_ROOT, "file_in_test_folder")
     with StorageFile.init(empty_file_uri) as empty_file:
         self.assertEqual(empty_file.size, 0)
         self.assertEqual(empty_file.tell(), 0)
         # Seeking to the end (whence=2) must also land on offset 0
         self.assertEqual(empty_file.seek(0, 2), 0)
         self.assertEqual(len(empty_file.read()), 0)
Пример #8
0
def _analyze_barcode(gzip_fastq, json_stats, logger):
    """Count reads by barcode in a gzipped FASTQ file and save the counts as JSON.

    The file is first decompressed next to the input (same path without the
    trailing ".gz"), then counted with IlluminaFASTQ.count_by_barcode().

    Args:
        gzip_fastq (str): Local path of the gzipped FASTQ file.
        json_stats (str): Location the barcode counts are written to as JSON.
        logger: Logger used for progress messages.

    Returns:
        str: json_stats, unchanged.
    """
    import shutil

    logger.debug(f"Analyzing barcode in {gzip_fastq}")

    logger.debug("Counting reads by barcode...")

    # Remove only a trailing ".gz": str.replace() would also strip ".gz" from
    # directory names or from the middle of the file name.
    gz_suffix = ".gz"
    if gzip_fastq.endswith(gz_suffix):
        fastq = gzip_fastq[:-len(gz_suffix)]
    else:
        # Without the suffix the output path would equal the input path and
        # opening it for writing would truncate the input, so use another name.
        fastq = gzip_fastq + ".unzipped"

    logger.debug("Unzipping %s to %s ..." % (gzip_fastq, fastq))
    with gzip.open(gzip_fastq, 'rb') as gzip_file, open(fastq, "wb") as unzipped_file:
        # Stream in 1 MiB blocks so the whole file is never held in memory
        shutil.copyfileobj(gzip_file, unzipped_file, 1 << 20)

    barcode_stats = IlluminaFASTQ(fastq).count_by_barcode()

    logger.debug(f"Barcode count: {len(barcode_stats)}")

    with StorageFile.init(json_stats, 'w') as fp:
        json.dump(barcode_stats, fp)

    return json_stats
Пример #9
0
    def __init__(self, file_path):
        """Remember the location of a FASTQ file after checking that it exists.

        Args:
            file_path: Location of the file; it is converted with str() before use.

        Raises:
            FileNotFoundError: If no file exists at file_path.
        """
        file_path = str(file_path)
        source = StorageFile(file_path)
        if not source.exists():
            raise FileNotFoundError("File not found at %s." % file_path)

        self.file_path = file_path
        logger.debug("Initialized Illumina FASTQ object.")
Пример #10
0
    def __get_valid_daily_cache(self, symbol):
        """Finds the newest daily cache file that has not expired yet.

        Cache files dated today, yesterday, and so on are checked in that order,
        for self.daily_cache_expiration days in total.

        Args:
            symbol (str): The symbol of the equity/stock.

        Returns:
            The StorageFile of the first cache file that exists. None if there is none.
        """
        # Lazily build one candidate per day, newest first
        candidates = (
            StorageFile(
                self.__cache_file_path(
                    symbol,
                    self.daily_series_type,
                    (datetime.datetime.now() - datetime.timedelta(days=days_ago)).strftime(self.date_fmt)
                )
            )
            for days_ago in range(self.daily_cache_expiration)
        )
        return next(
            (candidate for candidate in candidates if candidate.exists()),
            None
        )
Пример #11
0
 def test_gs_read_seek(self):
     """A file opened from a gs:// URI is seekable, readable and reports its size."""
     file_uri = "gs://aries_test/file_in_root.txt"
     with StorageFile.init(file_uri) as remote_file:
         self.assertEqual(remote_file.scheme, "gs")
         self.assertTrue(remote_file.seekable())
         self.assertTrue(remote_file.readable())
         self.assertEqual(remote_file.size, 34)
Пример #12
0
 def __save_data_frame(self, df, symbol, series_type):
     """Writes a non-empty data frame as CSV to the cache file of a symbol/series type.

     Args:
         df: The pandas data frame to save.
         symbol (str): The symbol of the equity/stock.
         series_type: The series type, used to build the cache file path.

     Returns:
         The path of the file written. None if the data frame is empty (nothing is written).
     """
     if not df.empty:
         file_path = self.__cache_file_path(symbol, series_type)
         logger.debug("Saving %s rows to... %s" % (len(df), file_path))
         with StorageFile.init(file_path, 'w') as csv_file:
             df.to_csv(csv_file)
         return file_path

     logger.info("Data frame is empty.")
     return None
Пример #13
0
    def setUpClass(cls):
        """Set up GCP credentials, probe GCP access and remove leftover test files.

        cls.GCP_ACCESS is set to True only when a bucket listing request
        succeeds. Any exception raised by the probe or by the clean-up is
        printed together with its traceback and otherwise ignored, so the
        set-up itself never fails because of GCP.
        """
        gs.setup_credentials("GOOGLE_CREDENTIALS", os.path.join(os.path.dirname(__file__), "gcp.json"))
        super().setUpClass()
        try:
            # Check if GCP is accessible by listing the buckets.
            # list_buckets() returns a lazy iterator, so no request is sent until it is
            # consumed; fetch (at most) one item to make sure the API is really called.
            next(iter(storage.Client().list_buckets(max_results=1)), None)
            cls.GCP_ACCESS = True

            # Remove the test folder and files if they are left over from an earlier run
            StorageFolder("gs://aries_test/copy_test/").delete()
            leftover_file_uris = (
                "gs://aries_test/copy_test",
                "gs://aries_test/abc.txt",
                "gs://aries_test/new_file.txt",
                "gs://aries_test/moved_file.txt",
                "gs://aries_test/local_upload.txt",
            )
            for file_uri in leftover_file_uris:
                StorageFile(file_uri).delete()
        except Exception as ex:
            print("%s: %s" % (type(ex), str(ex)))
            traceback.print_exc()
Пример #14
0
 def test_text_read_write(self):
     """Text written to a new file in 'w+' mode can be read back after seeking to the start."""
     temp_file_path = os.path.join(self.TEST_ROOT, "temp_file.txt")
     with StorageFile.init(temp_file_path, 'w+') as text_file:
         self.assertTrue(text_file.writable())
         # A new file starts at offset 0 and advances by the number of characters written
         self.assertEqual(text_file.tell(), 0)
         self.assertEqual(text_file.write("abc"), 3)
         self.assertEqual(text_file.tell(), 3)
         text_file.seek(0)
         self.assertEqual(text_file.read(), "abc")
         # TODO: File may not exist on the cloud until it is closed.
         # self.assertTrue(text_file.exists())
     text_file.delete()
Пример #15
0
    def __intraday_get_full_data(self, symbol):
        """Gets the most recent intraday data (which may include data of multiple days.)

        A valid cached file is read and returned as is. Otherwise the data is
        requested, saved in full to a time-stamped file under self.cache, and
        additionally saved per date for every date whose data is complete.

        Args:
            symbol (str): The symbol of the equity/stock.

        Returns: A pandas data frame of intraday series data.

        """
        def day_label(group_key):
            # Date part ("YYYY-MM-DD") of a group key rendered as string
            return str(group_key).split(" ")[0]

        series_type = self.intraday_series_type
        cached_file = self.__intraday_valid_cache(symbol)
        if cached_file:
            logger.debug("Reading cached file: %s" % cached_file.uri)
            with cached_file('r') as cached_csv:
                cached_df = pd.read_csv(cached_csv, index_col=0, parse_dates=['timestamp'])
            return cached_df

        df = self.__request_data(symbol, series_type, 'full', interval="1min")

        # Save everything that was received, named by the time of the request
        full_file_path = os.path.join(self.cache, self.__intraday_cache_file_prefix(symbol)) \
            + datetime.datetime.now().strftime(self.intraday_time_fmt)
        logger.debug("Saving intraday data...")
        with StorageFile.init(full_file_path, 'w') as full_csv:
            df.to_csv(full_csv)

        # One group per calendar date
        by_day = df.groupby(df['timestamp'].dt.normalize())
        latest_day = max([day_label(key) for key, _ in by_day])

        for key, day_rows in by_day:
            day = day_label(key)
            # The data for a date is complete if there is data at 1600 or the date is not the latest one
            has_closing_row = not day_rows[day_rows.timestamp == day + " 16:00:00"].empty
            if has_closing_row or day < latest_day:
                day_file_path = self.__cache_file_path(symbol, series_type, day)
                with StorageFile.init(day_file_path, 'w') as day_csv:
                    day_rows.reset_index(drop=True).to_csv(day_csv)
        return df
Пример #16
0
 def __init__(self, uri, annotation_uri):
     """Load a file line by line, separating header lines from variant lines.

     Lines starting with "#" are kept in self.headers. Every other non-empty
     line becomes a Variant, paired with the annotation stored under that
     line's variant key (None if there is no such annotation).

     Args:
         uri: Location of the file to read.
         annotation_uri: Location passed to self.load_annotation().
     """
     super().__init__(uri)

     # The file content may arrive as bytes or str; work with a list of text lines
     self.content = StorageFile.init(uri).read()
     if isinstance(self.content, bytes):
         self.content = self.content.decode()
     self.content = self.content.split("\n")

     self.headers = []
     self.variants = []
     self.annotations = self.load_annotation(annotation_uri)

     # filter(None, ...) skips the empty lines
     for line in filter(None, self.content):
         if not line.startswith("#"):
             annotation = self.annotations.get(self.variant_key(line))
             self.variants.append(Variant(line, annotation))
         else:
             self.headers.append(line)
Пример #17
0
    def peek_barcode(self):
        """Collect barcodes from the first 4000 lines of the gzipped file.

        Only lines starting with "@" are inspected. The text after the last ":"
        of such a line is taken as the raw barcode; if it matches
        self.dual_index_pattern it is converted with self.convert_barcode() and
        its entry in the result is updated through self.__count_barcode().

        Returns:
            dict: Converted barcode -> value returned by self.__count_barcode().
        """
        barcode_dict = {}
        with StorageFile.init(self.file_path, 'rb') as raw_file, \
                gzip.GzipFile(fileobj=raw_file) as gz:
            for line_number, line in enumerate(gz, start=1):
                if line_number > 4000:
                    break
                # The line containing barcode starts with @
                if line.startswith(b"@"):
                    text = line.decode() if isinstance(line, bytes) else line
                    # Raw barcode
                    barcode = text.strip().split(":")[-1]

                    if re.match(self.dual_index_pattern, barcode):
                        barcode = self.convert_barcode(barcode)
                        barcode_dict[barcode] = self.__count_barcode(barcode_dict, barcode, line_number)
        return barcode_dict
Пример #18
0
    def test_parse_uri(self):
        """Tests parsing S3 URIs of a file, a folder and the bucket root.
        """
        bucket_uri = "s3://%s" % self.TEST_BUCKET_NAME

        # File
        file_obj = StorageFile(bucket_uri + "/test_file.txt")
        self.assertEqual(file_obj.scheme, "s3")
        self.assertEqual(file_obj.path, "/test_file.txt")

        # Folder: URI and path are expected with a trailing slash
        folder_obj = StorageFolder(bucket_uri + "/test_folder")
        self.assertEqual(folder_obj.uri, bucket_uri + "/test_folder/")
        self.assertEqual(folder_obj.scheme, "s3")
        self.assertEqual(folder_obj.path, "/test_folder/")

        # Bucket root
        root_obj = StorageFolder(bucket_uri)
        self.assertEqual(root_obj.uri, bucket_uri + "/")
        self.assertEqual(root_obj.scheme, "s3")
        self.assertEqual(root_obj.path, "/")
Пример #19
0
 def read_count(self):
     """Count the items produced by iterating over this object.

     self.gzip is replaced with a reader on the result of calling local()
     on the opened file before the count starts.

     Returns:
         int: Number of items yielded by iterating over self.
     """
     logger.debug("Counting reads in file %s..." % self.uri)
     local_source = StorageFile.init(self.uri, "rb").local()
     self.gzip = gzip.GzipFile(fileobj=local_source)
     reads = list(self)
     return len(reads)
Пример #20
0
 def test_create_and_move_blob(self):
     """A created file exists, and after a move it exists only at the destination."""
     source_file = StorageFile("gs://aries_test/new_file.txt")

     # Create
     self.assertFalse(source_file.blob.exists())
     source_file.create()
     self.assertTrue(source_file.blob.exists())

     # Move
     moved_uri = "gs://aries_test/moved_file.txt"
     source_file.move(moved_uri)
     self.assertFalse(source_file.exists())
     moved_file = StorageFile(moved_uri)
     self.assertTrue(moved_file.exists())

     # Clean up
     moved_file.delete()
Пример #21
0
 def copy_from_http(self):
     """Copy a PDF from an HTTPS URL to a gs:// location and check that it arrived."""
     source_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
     gs_path = "gs://davelab_temp/qq6/test.pdf"

     StorageFile(source_url).copy(gs_path)
     self.assertTrue(StorageFile(gs_path).exists())

     # Clean up the copied file
     StorageFile(gs_path).delete()
Пример #22
0
    def test_md5(self):
        """Copies of a local file on GCP (and on AWS, if configured) report the same MD5.
        """
        fixture_path = os.path.join(os.path.dirname(__file__), "fixtures",
                                    "links.md")
        # Local
        local_file = StorageFile(fixture_path)
        expected_md5 = local_file.md5_hex
        self.assertIsNotNone(expected_md5)

        # GCP
        gs_path = "gs://aries_test/links.md"
        local_file.copy(gs_path)
        gs_file = StorageFile(gs_path)
        self.assertEqual(expected_md5, gs_file.md5_hex)
        gs_file.delete()

        # AWS, only when both credential variables are set
        aws_configured = (os.environ.get("AWS_SECRET_ACCESS_KEY")
                          and os.environ.get("AWS_ACCESS_KEY_ID"))
        if aws_configured:
            s3_path = "s3://davelab-test/links.md"
            local_file.copy(s3_path)
            s3_file = StorageFile(s3_path)
            self.assertEqual(expected_md5, s3_file.md5_hex)
            s3_file.delete()
Пример #23
0
 def create_file(cls, relative_path, content):
     """Writes content to a file located relative to the test root.

     Args:
         relative_path: Path of the file, relative to cls.TEST_ROOT.
         content: Text to write into the file.
     """
     target_path = os.path.join(cls.TEST_ROOT, relative_path)
     with StorageFile.init(target_path, "w") as target_file:
         target_file.write(content)
Пример #24
0
    def test_create_copy_and_delete_file(self):
        """Files can be copied into (and over files in) a folder, and emptying the folder removes them."""
        source_folder_uri = os.path.join(self.TEST_ROOT, "test_folder_0")
        new_folder_uri = os.path.join(self.TEST_ROOT, "new_folder")
        copied_file_uri = os.path.join(new_folder_uri, "copied_file")

        with TempFolder(new_folder_uri) as folder:
            self.assertTrue(folder.is_empty())

            # Create a sub folder inside the new folder
            sub_folder_uri = os.path.join(new_folder_uri, "sub_folder")
            logger.debug(sub_folder_uri)
            sub_folder = StorageFolder(sub_folder_uri).create()
            self.assertTrue(sub_folder.exists())

            # Copy an empty file
            empty_source = StorageFile(os.path.join(source_folder_uri, "empty_file"))
            logger.debug(empty_source.exists())
            time.sleep(2)
            empty_source.copy(copied_file_uri)
            self.assertTrue(StorageFile(copied_file_uri).exists())

            # Copy a file with content and replace the empty file
            text_source = StorageFile(os.path.join(source_folder_uri, "abc.txt"))
            text_source.copy(copied_file_uri)
            copied_file = StorageFile(copied_file_uri)
            self.assertTrue(copied_file.exists())
            # Use the shortcut to read file, the content will be binary.
            self.assertEqual(copied_file.read(), b"abc\ncba\n")

            # Empty the folder. This should delete file and sub folder only
            folder.empty()
            self.assertTrue(folder.exists())
            self.assertTrue(folder.is_empty())
            self.assertFalse(sub_folder.exists())
            self.assertFalse(copied_file.exists())
Пример #25
0
 def test_binary_read_write(self):
     """Bytes written to a new file in 'wb' mode can be read back in 'rb' mode."""
     # File does not exist, a new one will be created
     binary_file = StorageFile(os.path.join(self.TEST_ROOT, "test.txt")).open("wb")
     self.assertEqual(binary_file.scheme, self.SCHEME)
     self.assertTrue(binary_file.seekable())
     # Opened write-only, so it must not be readable
     self.assertFalse(binary_file.readable())

     # Each write returns the number of bytes written and advances the position
     self.assertEqual(binary_file.write(b"abc"), 3)
     self.assertEqual(binary_file.tell(), 3)
     self.assertEqual(binary_file.write(b"def"), 3)
     self.assertEqual(binary_file.tell(), 6)
     binary_file.close()
     self.assertTrue(binary_file.exists())

     # Read everything back
     binary_file.open('rb')
     self.assertEqual(binary_file.read(), b"abcdef")
     binary_file.close()

     # Clean up
     binary_file.delete()
     self.assertFalse(binary_file.exists())
Пример #26
0
    def test_http(self):
        """Checks existence of HTTP(S) URLs and downloads one to a local file.
        """
        # URL does not exist
        missing_url = StorageFile("http://example.com/abc/")
        self.assertFalse(missing_url.exists())

        # URL exists
        existing_url = StorageFile("https://www.google.com")
        self.assertTrue(existing_url.exists())

        # Download. Copy to local file.
        remote_pdf = StorageFile("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")
        local_file_path = os.path.join(self.test_folder_path, "test.pdf")
        # Start from a clean slate so the existence check below proves the download happened
        if os.path.exists(local_file_path):
            os.remove(local_file_path)
        remote_pdf.copy(local_file_path)
        self.assertTrue(os.path.exists(local_file_path))
        self.assertGreater(StorageFile(local_file_path).size, 0)
        StorageFile(local_file_path).delete()
Пример #27
0
 def __init__(self, uri):
     """Open the gzipped file at uri for reading.

     Args:
         uri: Location of the gzipped file; kept in self.uri.
     """
     self.uri = uri
     # The storage file is opened in binary mode and wrapped by the gzip reader
     binary_source = StorageFile.init(uri, "rb")
     self.gzip = gzip.GzipFile(fileobj=binary_source)
     self.current = 0
Пример #28
0
    def test_gs_file(self):
        """Tests reading an existing Google Cloud Storage file and writing a new one.
        """
        existing_file = StorageFile("gs://aries_test/file_in_root.txt")
        missing_file = StorageFile("gs://aries_test/abc.txt")

        # Test the blob property
        # File exists
        self.assertFalse(existing_file.is_gz())
        self.assertTrue(existing_file.blob.exists())
        self.assertEqual(existing_file.size, 34)
        # File does not exist
        self.assertFalse(missing_file.blob.exists())

        # Test the read() method
        self.assertEqual(existing_file.read(), b'This is a file in the bucket root.')
        with self.assertRaises(Exception):
            missing_file.read()

        # Test write into a new file
        with missing_file('w+b') as new_file:
            new_file.write(b"abc")
            new_file.seek(0)
            self.assertEqual(new_file.read(), b"abc")

        # File will be uploaded to bucket after closed.
        # Test reading from the bucket
        self.assertEqual(missing_file.read(), b"abc")
        missing_file.delete()