def get_file_size(self, path, job_name=None, **kwargs):
    # Ignore local paths
    if self.__get_file_protocol(path) == "Local":
        logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
        return True
    try:
        # If the path is a prefix, create a StoragePrefix object and get its size
        if path.endswith("*"):
            _size = StoragePrefix(path.rstrip("*")).size
        # Otherwise check whether the path exists as a file or a folder,
        # by creating StorageFile and StorageFolder objects
        else:
            _file = StorageFile(path)
            _folder = StorageFolder(path)
            if _file.exists():
                _size = _file.size
            elif _folder.exists():
                _size = _folder.size
            else:
                _size = 0
        # Convert to GB
        return float(_size) / 2 ** 30
    except BaseException as e:
        logging.error(f"Unable to get file size: {path}")
        if str(e) != "":
            logging.error(f"Received the following msg:\n{e}")
        raise

def transfer_file(to_folder_uri, file_id=None, file_info_href=None):
    # Determine the file_id, file_info_href and file_content_href
    if file_id is not None:
        file_info_href = "v1pre3/files/%s" % file_id
        file_content_href = "v1pre3/files/%s/content" % file_id
    elif file_info_href is not None:
        file_id = file_info_href.strip("/").split("/")[-1]
        file_content_href = "%s/content" % file_info_href
    else:
        raise ValueError("Either BaseSpace file_id or file_info_href is needed for file transfer.")
    file_info = api_response(file_info_href)
    logger.debug("Transferring file from BaseSpace: %s" % file_content_href)
    # For FASTQ files, append the BaseSpace file ID to the filename,
    # since each MiSeq run may have multiple FASTQ files with the same name.
    filename = file_info.get("Name")
    if filename.endswith(".fastq.gz"):
        filename = filename.replace(".fastq.gz", "_%s.fastq.gz" % file_id)
    # Skip the transfer if a file already exists at the destination with the same size.
    to_uri = os.path.join(to_folder_uri, filename)
    dest_file = StorageFile(to_uri)
    file_size = file_info.get("Size")
    if file_size and dest_file.exists() and dest_file.size == file_size:
        logger.debug("File %s exists at destination: %s" % (filename, to_uri))
        return to_uri
    from_uri = build_api_url(file_content_href)
    StorageFile(from_uri).copy(to_uri)
    return to_uri

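# A hedged usage sketch for transfer_file: the destination folder URI and the BaseSpace
# file ID below are hypothetical placeholders, not values taken from this project.
# Either file_id or file_info_href must be provided; the function returns the destination
# URI and skips the copy when a same-size file already exists there.
#
#   to_uri = transfer_file("gs://example-bucket/fastq/", file_id="123456789")
#   to_uri = transfer_file("gs://example-bucket/fastq/", file_info_href="v1pre3/files/123456789")
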
def get_intraday_series(self, symbol, date=None):
    """Gets a pandas data frame of intraday series data.

    Args:
        symbol (str): The name of the equity/stock.
        date (str, optional): Date, e.g. 2017-02-12. Defaults to None.

    Returns:
        A pandas data frame of intraday series data for the specific date.
        If date is None, the data of the last trading day will be returned.
        This function returns None if date is None and no data is available
        in the last 100 days.

    """
    series_type = self.intraday_series_type
    # requested_date stores the original requested date
    requested_date = date
    day_delta = 0
    df = None
    # When a date is specified, an empty data frame is returned if there is no data for that day.
    # When no date is specified, keep trying the previous day (up to 100 days back) until data is found.
    while df is None or (requested_date is None and df.empty and day_delta < 100):
        if requested_date is None:
            date = (datetime.datetime.now() - datetime.timedelta(days=day_delta)).strftime(self.date_fmt)
        logger.debug("Getting data for %s" % date)
        # Get the next date as a string for filtering purposes.
        # next_date is a date string, which will be compared with the data frame index.
        dt_date = datetime.datetime.strptime(date, self.date_fmt)
        dt_next = dt_date.date() + datetime.timedelta(days=1)
        next_date = dt_next.strftime(self.date_fmt)
        if self.cache:
            # Check if data has been cached.
            file_path = self.__cache_file_path(symbol, series_type, date)
            storage_file = StorageFile(file_path)
            if storage_file.exists():
                logger.debug("Reading existing data... %s" % file_path)
                with storage_file('r') as f:
                    df = pd.read_csv(f, index_col=0, parse_dates=['timestamp'])
            else:
                df = self.__intraday_get_full_data(symbol)
                df = df[(df['timestamp'] >= date) & (df['timestamp'] < next_date)]
        else:
            # Request new data
            df = self.__request_data(symbol, series_type, 'full')
            df = df[(df['timestamp'] >= date) & (df['timestamp'] < next_date)]
        day_delta += 1
    if df is not None:
        df.set_index('timestamp', inplace=True)
        df.symbol = symbol
    return df

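# A hedged usage sketch for get_intraday_series: "data_source" stands for an already
# configured instance of the class defining this method (the class itself is not shown
# in this excerpt), and the symbol/date values are illustrative only.
#
#   df = data_source.get_intraday_series("AAPL", date="2017-02-12")  # a specific trading day
#   df = data_source.get_intraday_series("AAPL")                     # last trading day with data
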
def get_file_size(self, path, job_name=None, **kwargs):
    retry_count = kwargs.get("retry_count", 0)
    # Ignore local paths
    if self.__get_file_protocol(path) == "Local":
        logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
        return True
    if retry_count < 5:
        try:
            # If the path is a prefix, create a StoragePrefix object and get its size
            if path.endswith("*"):
                _size = StoragePrefix(path.rstrip("*")).size
            # Otherwise check whether the path exists as a file or a folder,
            # by creating StorageFile and StorageFolder objects
            else:
                _file = StorageFile(path)
                _folder = StorageFolder(path)
                _size = 0
                found = False
                trial_count = 0
                while not found:
                    if trial_count > 10:
                        logging.error(f"Cannot get size of '{path}' as it doesn't exist after multiple trials!")
                        break
                    time.sleep(trial_count)
                    if _file.exists():
                        _size = _file.size
                        found = True
                    elif _folder.exists():
                        _size = _folder.size
                        found = True
                    else:
                        trial_count += 1
                        logging.warning(f"Cannot get size of '{path}' as it does not exist! Trial {trial_count}/10")
            # Convert to GB
            return float(_size) / 2 ** 30
        except BaseException as e:
            logging.error(f"Unable to get file size: {path}")
            if str(e) != "":
                logging.error(f"Received the following msg:\n{e}")
            if "dictionary changed size" in str(e):
                kwargs['retry_count'] = retry_count + 1
                return self.get_file_size(path, job_name, **kwargs)
            raise
    else:
        logging.warning(f"Failed to get size of '{path}'! Attempted to retrieve size {retry_count} times.")
        return 0

def test_upload_from_file(self):
    gs_file = StorageFile("gs://aries_test/local_upload.txt")
    # Try to upload a file that does not exist.
    local_file_non_exist = os.path.join(os.path.dirname(__file__), "abc.txt")
    with self.assertRaises(FileNotFoundError):
        gs_file.upload_from_file(local_file_non_exist)
    # Upload a file and check the content.
    local_file = os.path.join(os.path.dirname(__file__), "fixtures", "test_file.txt")
    gs_file.upload_from_file(local_file)
    self.assertEqual(gs_file.read(), b'This is a local test file.\n')
    gs_file.delete()

def path_exists(self, path, job_name=None, **kwargs):
    # Ignore local paths
    if self.__get_file_protocol(path) == "Local":
        logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
        return True
    try:
        logging.debug(f"Checking existence of {path}...")
        # If the path is a prefix, create a StoragePrefix object and check whether it exists
        if path.endswith("*"):
            return StoragePrefix(path.rstrip("*")).exists()
        # Otherwise check whether it exists as a file or a folder,
        # by creating StorageFile and StorageFolder objects
        return StorageFile(path).exists() or StorageFolder(path).exists()
    except RuntimeError as e:
        traceback.print_exc()
        if str(e) != "":
            logging.error(f"StorageHelper error for {job_name}:\n{e}")
        return False
    except:
        traceback.print_exc()
        logging.error(f"Unable to check path existence: {path}")
        raise

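# A hedged usage sketch for path_exists: "helper" stands for an instance of the storage
# helper class defining this method, and the bucket paths are hypothetical placeholders.
#
#   helper.path_exists("gs://example-bucket/run1/sample.fastq.gz")  # a single file or folder
#   helper.path_exists("gs://example-bucket/run1/sample_*")         # trailing "*" checks a prefix
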
def test_text_read(self):
    with StorageFile.init(os.path.join(self.TEST_ROOT, "file_in_test_folder")) as f:
        self.assertEqual(f.size, 0)
        self.assertEqual(f.tell(), 0)
        self.assertEqual(f.seek(0, 2), 0)
        self.assertEqual(len(f.read()), 0)

def _analyze_barcode(gzip_fastq, json_stats, logger):
    logger.debug(f"Analyzing barcode in {gzip_fastq}")
    logger.debug("Counting reads by barcode...")
    # fastq = File.unzip(gzip_fastq)
    fastq = gzip_fastq.replace(".gz", "")
    with gzip.open(gzip_fastq, 'rb') as gzip_file:
        with open(fastq, "wb") as unzipped_file:
            logger.debug("Unzipping %s to %s ..." % (gzip_fastq, fastq))
            block_size = 1 << 20
            while True:
                block = gzip_file.read(block_size)
                if not block:
                    break
                unzipped_file.write(block)
    barcode_stats = IlluminaFASTQ(fastq).count_by_barcode()
    logger.debug(f"Barcode count: {len(barcode_stats.keys())}")
    with StorageFile.init(json_stats, 'w') as fp:
        json.dump(barcode_stats, fp)
    return json_stats

def __init__(self, file_path):
    file_path = str(file_path)
    if not StorageFile(file_path).exists():
        raise FileNotFoundError("File not found at %s." % file_path)
    self.file_path = file_path
    logger.debug("Initialized Illumina FASTQ object.")

def __get_valid_daily_cache(self, symbol):
    """Gets the latest un-expired cache file for daily data.

    Args:
        symbol (str): The symbol of the equity/stock.

    Returns:
        StorageFile: The cache file, if an un-expired cache file exists. Otherwise None.

    """
    for i in range(self.daily_cache_expiration):
        d = datetime.datetime.now() - datetime.timedelta(days=i)
        file_path = self.__cache_file_path(symbol, self.daily_series_type, d.strftime(self.date_fmt))
        storage_file = StorageFile(file_path)
        if storage_file.exists():
            return storage_file
    return None

def test_gs_read_seek(self):
    # GSFile instance
    with StorageFile.init("gs://aries_test/file_in_root.txt") as gs_file:
        self.assertEqual(gs_file.scheme, "gs")
        # self.assertEqual(str(type(gs_file).__name__), "GSFile")
        self.assertTrue(gs_file.seekable())
        self.assertTrue(gs_file.readable())
        self.assertEqual(gs_file.size, 34)

def __save_data_frame(self, df, symbol, series_type):
    if df.empty:
        logger.info("Data frame is empty.")
        return None
    file_path = self.__cache_file_path(symbol, series_type)
    logger.debug("Saving %s rows to... %s" % (len(df), file_path))
    with StorageFile.init(file_path, 'w') as f:
        df.to_csv(f)
    return file_path

def setUpClass(cls):
    gs.setup_credentials("GOOGLE_CREDENTIALS", os.path.join(os.path.dirname(__file__), "gcp.json"))
    super().setUpClass()
    try:
        # Check if GCP is accessible by listing the buckets
        storage.Client().list_buckets(max_results=1)
        cls.GCP_ACCESS = True
        # Remove leftover test files and folders if they are already there
        StorageFolder("gs://aries_test/copy_test/").delete()
        StorageFile("gs://aries_test/copy_test").delete()
        StorageFile("gs://aries_test/abc.txt").delete()
        StorageFile("gs://aries_test/new_file.txt").delete()
        StorageFile("gs://aries_test/moved_file.txt").delete()
        StorageFile("gs://aries_test/local_upload.txt").delete()
    except Exception as ex:
        print("%s: %s" % (type(ex), str(ex)))
        traceback.print_exc()

def test_text_read_write(self):
    # Write a new file
    temp_file_path = os.path.join(self.TEST_ROOT, "temp_file.txt")
    with StorageFile.init(temp_file_path, 'w+') as f:
        self.assertTrue(f.writable())
        self.assertEqual(f.tell(), 0)
        self.assertEqual(f.write("abc"), 3)
        self.assertEqual(f.tell(), 3)
        f.seek(0)
        self.assertEqual(f.read(), "abc")
        # TODO: File may not exist on the cloud until it is closed.
        # self.assertTrue(f.exists())
        f.delete()

def __intraday_get_full_data(self, symbol):
    """Gets the most recent intraday data (which may include data of multiple days).

    Args:
        symbol (str): The symbol of the equity/stock.

    Returns:
        A pandas data frame of intraday series data.

    """
    series_type = self.intraday_series_type
    cached_file = self.__intraday_valid_cache(symbol)
    if cached_file:
        logger.debug("Reading cached file: %s" % cached_file.uri)
        with cached_file('r') as f:
            df = pd.read_csv(f, index_col=0, parse_dates=['timestamp'])
        return df
    df = self.__request_data(symbol, series_type, 'full', interval="1min")
    file_path = os.path.join(self.cache, self.__intraday_cache_file_prefix(symbol)) \
        + datetime.datetime.now().strftime(self.intraday_time_fmt)
    logger.debug("Saving intraday data...")
    with StorageFile.init(file_path, 'w') as f:
        df.to_csv(f)
    # Group data by date
    groups = df.groupby(df['timestamp'].dt.normalize())
    # Get the latest date in the data frame
    dates = [str(name).split(" ")[0] for name, _ in groups]
    latest = max(dates)
    for name, group in groups:
        date = str(name).split(" ")[0]
        # The data for a date is complete if there is data at 16:00 or the date is not the latest one
        if not group[group.timestamp == date + " 16:00:00"].empty or date < latest:
            date_file_path = self.__cache_file_path(symbol, series_type, date)
            with StorageFile.init(date_file_path, 'w') as f:
                group.reset_index(drop=True).to_csv(f)
    return df

def __init__(self, uri, annotation_uri):
    super().__init__(uri)
    self.content = StorageFile.init(uri).read()
    if isinstance(self.content, bytes):
        self.content = self.content.decode()
    self.content = self.content.split("\n")
    self.headers = []
    self.variants = []
    self.annotations = self.load_annotation(annotation_uri)
    for line in self.content:
        if not line:
            continue
        if line.startswith("#"):
            self.headers.append(line)
        else:
            key = self.variant_key(line)
            self.variants.append(Variant(line, self.annotations.get(key)))

def peek_barcode(self):
    barcode_dict = {}
    with StorageFile.init(self.file_path, 'rb') as f:
        with gzip.GzipFile(fileobj=f) as gz:
            for i, line in enumerate(gz, start=1):
                if i > 4000:
                    break
                # The line containing the barcode starts with @
                if not line.startswith(b"@"):
                    continue
                if isinstance(line, bytes):
                    line = line.decode()
                # Raw barcode
                barcode = line.strip().split(":")[-1]
                if re.match(self.dual_index_pattern, barcode):
                    barcode = self.convert_barcode(barcode)
                barcode_dict[barcode] = self.__count_barcode(barcode_dict, barcode, i)
    return barcode_dict

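# A hedged usage sketch for peek_barcode: the FASTQ URI below is a hypothetical placeholder.
# peek_barcode() scans only the first 4000 lines (about 1000 reads) of the gzipped FASTQ
# and returns a dict keyed by the (possibly converted dual-index) barcodes it sees.
#
#   barcode_counts = IlluminaFASTQ("gs://example-bucket/sample_R1.fastq.gz").peek_barcode()
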
def test_parse_uri(self):
    """Tests parsing an S3 URI
    """
    # File
    file_obj = StorageFile("s3://%s/test_file.txt" % self.TEST_BUCKET_NAME)
    self.assertEqual(file_obj.scheme, "s3")
    self.assertEqual(file_obj.path, "/test_file.txt")
    # Folder
    folder_obj = StorageFolder("s3://%s/test_folder" % self.TEST_BUCKET_NAME)
    self.assertEqual(folder_obj.uri, "s3://%s/test_folder/" % self.TEST_BUCKET_NAME)
    self.assertEqual(folder_obj.scheme, "s3")
    self.assertEqual(folder_obj.path, "/test_folder/")
    # Bucket root
    folder_obj = StorageFolder("s3://%s" % self.TEST_BUCKET_NAME)
    self.assertEqual(folder_obj.uri, "s3://%s/" % self.TEST_BUCKET_NAME)
    self.assertEqual(folder_obj.scheme, "s3")
    self.assertEqual(folder_obj.path, "/")

def read_count(self):
    logger.debug("Counting reads in file %s..." % self.uri)
    self.gzip = gzip.GzipFile(fileobj=StorageFile.init(self.uri, "rb").local())
    return len(list(self))

def test_create_and_move_blob(self):
    gs_file = StorageFile("gs://aries_test/new_file.txt")
    self.assertFalse(gs_file.blob.exists())
    gs_file.create()
    self.assertTrue(gs_file.blob.exists())
    dest = "gs://aries_test/moved_file.txt"
    gs_file.move(dest)
    self.assertFalse(gs_file.exists())
    dest_file = StorageFile(dest)
    self.assertTrue(dest_file.exists())
    dest_file.delete()

def copy_from_http(self):
    storage_obj = StorageFile("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")
    gs_path = "gs://davelab_temp/qq6/test.pdf"
    storage_obj.copy(gs_path)
    self.assertTrue(StorageFile(gs_path).exists())
    StorageFile(gs_path).delete()

def test_md5(self):
    """Tests computing the MD5 hex digest of local and cloud files.
    """
    test_file = os.path.join(os.path.dirname(__file__), "fixtures", "links.md")
    # Local
    local_file = StorageFile(test_file)
    local_md5 = local_file.md5_hex
    self.assertIsNotNone(local_md5)
    # GCP
    gs_path = "gs://aries_test/links.md"
    local_file.copy(gs_path)
    gs_file = StorageFile(gs_path)
    self.assertEqual(local_md5, gs_file.md5_hex)
    gs_file.delete()
    # AWS
    if os.environ.get("AWS_SECRET_ACCESS_KEY") and os.environ.get("AWS_ACCESS_KEY_ID"):
        s3_path = "s3://davelab-test/links.md"
        local_file.copy(s3_path)
        s3_file = StorageFile(s3_path)
        self.assertEqual(local_md5, s3_file.md5_hex)
        s3_file.delete()

def create_file(cls, relative_path, content):
    """Creates a file relative to the test root
    """
    abs_path = os.path.join(cls.TEST_ROOT, relative_path)
    with StorageFile.init(abs_path, "w") as f:
        f.write(content)

def test_create_copy_and_delete_file(self):
    new_folder_uri = os.path.join(self.TEST_ROOT, "new_folder")
    with TempFolder(new_folder_uri) as folder:
        self.assertTrue(folder.is_empty())
        # Create a sub folder inside the new folder
        sub_folder_uri = os.path.join(new_folder_uri, "sub_folder")
        logger.debug(sub_folder_uri)
        sub_folder = StorageFolder(sub_folder_uri).create()
        self.assertTrue(sub_folder.exists())
        # Copy an empty file
        src_file_path = os.path.join(self.TEST_ROOT, "test_folder_0", "empty_file")
        dst_file_path = os.path.join(new_folder_uri, "copied_file")
        f = StorageFile(src_file_path)
        logger.debug(f.exists())
        time.sleep(2)
        f.copy(dst_file_path)
        self.assertTrue(StorageFile(dst_file_path).exists())
        # Copy a file with content and replace the empty file
        src_file_path = os.path.join(self.TEST_ROOT, "test_folder_0", "abc.txt")
        dst_file_path = os.path.join(new_folder_uri, "copied_file")
        f = StorageFile(src_file_path)
        f.copy(dst_file_path)
        dst_file = StorageFile(dst_file_path)
        self.assertTrue(dst_file.exists())
        # Use the read() shortcut; the content will be binary.
        self.assertEqual(dst_file.read(), b"abc\ncba\n")
        # Empty the folder. This should delete the file and the sub folder only.
        folder.empty()
        self.assertTrue(folder.exists())
        self.assertTrue(folder.is_empty())
        self.assertFalse(sub_folder.exists())
        self.assertFalse(dst_file.exists())

def test_binary_read_write(self):
    # File does not exist; a new one will be created
    file_uri = os.path.join(self.TEST_ROOT, "test.txt")
    storage_file = StorageFile(file_uri).open("wb")
    self.assertEqual(storage_file.scheme, self.SCHEME)
    self.assertTrue(storage_file.seekable())
    self.assertFalse(storage_file.readable())
    self.assertEqual(storage_file.write(b"abc"), 3)
    self.assertEqual(storage_file.tell(), 3)
    self.assertEqual(storage_file.write(b"def"), 3)
    self.assertEqual(storage_file.tell(), 6)
    storage_file.close()
    self.assertTrue(storage_file.exists())
    storage_file.open('rb')
    self.assertEqual(storage_file.read(), b"abcdef")
    storage_file.close()
    storage_file.delete()
    self.assertFalse(storage_file.exists())

def test_http(self):
    """Tests accessing HTTP/HTTPS URLs with StorageFile.
    """
    # URL does not exist
    storage_obj = StorageFile("http://example.com/abc/")
    self.assertFalse(storage_obj.exists())
    # URL exists
    storage_obj = StorageFile("https://www.google.com")
    self.assertTrue(storage_obj.exists())
    # Download: copy to a local file.
    storage_obj = StorageFile("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")
    local_file_path = os.path.join(self.test_folder_path, "test.pdf")
    if os.path.exists(local_file_path):
        os.remove(local_file_path)
    storage_obj.copy(local_file_path)
    self.assertTrue(os.path.exists(local_file_path))
    self.assertGreater(StorageFile(local_file_path).size, 0)
    StorageFile(local_file_path).delete()

def __init__(self, uri):
    self.uri = uri
    self.gzip = gzip.GzipFile(fileobj=StorageFile.init(uri, "rb"))
    self.current = 0

def test_gs_file(self):
    """Tests accessing a Google Cloud Storage file.
    """
    # Test the blob property
    # File exists
    gs_file_exists = StorageFile("gs://aries_test/file_in_root.txt")
    self.assertFalse(gs_file_exists.is_gz())
    self.assertTrue(gs_file_exists.blob.exists())
    self.assertEqual(gs_file_exists.size, 34)
    # File does not exist
    gs_file_null = StorageFile("gs://aries_test/abc.txt")
    self.assertFalse(gs_file_null.blob.exists())
    # Test the read() method
    self.assertEqual(gs_file_exists.read(), b'This is a file in the bucket root.')
    with self.assertRaises(Exception):
        gs_file_null.read()
    # Test writing into a new file
    with gs_file_null('w+b') as f:
        f.write(b"abc")
        f.seek(0)
        self.assertEqual(f.read(), b"abc")
        # The file will be uploaded to the bucket after it is closed.
    # Test reading from the bucket
    self.assertEqual(gs_file_null.read(), b"abc")
    gs_file_null.delete()