示例#1
0
def _analyze_barcode(gzip_fastq, json_stats, logger):
    logger.debug(f"Analyzing barcode in {gzip_fastq}")

    logger.debug("Counting reads by barcode...")

    # fastq = File.unzip(gzip_fastq)
    fastq = gzip_fastq.replace(".gz", "")

    with gzip.open(gzip_fastq, 'rb') as gzip_file:
        with open(fastq, "wb") as unzipped_file:
            logger.debug("Unzipping %s to %s ..." % (gzip_fastq, fastq))
            block_size = 1 << 20
            while True:
                block = gzip_file.read(block_size)
                if not block:
                    break
                unzipped_file.write(block)

    barcode_stats = IlluminaFASTQ(fastq).count_by_barcode()

    logger.debug(f"Barcode count: {len(barcode_stats.keys())}")

    with StorageFile.init(json_stats, 'w') as fp:
        json.dump(barcode_stats, fp)

    return json_stats
示例#2
0
 def test_text_read(self):
     """Opening "file_in_test_folder" with the default mode should give an empty stream."""
     target = os.path.join(self.TEST_ROOT, "file_in_test_folder")
     with StorageFile.init(target) as stream:
         # Both the reported size and the initial position are expected to be 0.
         self.assertEqual(stream.size, 0)
         self.assertEqual(stream.tell(), 0)
         # Seeking relative to the end (whence=2) should also land on offset 0.
         end_offset = stream.seek(0, 2)
         self.assertEqual(end_offset, 0)
         remaining = stream.read()
         self.assertEqual(len(remaining), 0)
示例#3
0
 def test_gs_read_seek(self):
     """Check scheme, seekable/readable flags and size of a file opened from a gs:// URI."""
     uri = "gs://aries_test/file_in_root.txt"
     with StorageFile.init(uri) as remote:
         self.assertEqual(remote.scheme, "gs")
         # self.assertEqual(str(type(remote).__name__), "GSFile")
         for capability in (remote.seekable, remote.readable):
             self.assertTrue(capability())
         self.assertEqual(remote.size, 34)
示例#4
0
文件: source.py 项目: qiuosier/Virgo
 def __save_data_frame(self, df, symbol, series_type):
     """Write a non-empty data frame as CSV to its cache file.

     Args:
         df: Data frame to save; only ``df.empty``, ``len(df)`` and
             ``df.to_csv`` are used here.
         symbol: Passed to ``__cache_file_path`` to build the destination.
         series_type: Passed to ``__cache_file_path`` to build the destination.

     Returns:
         The path of the file written, or None when the data frame is empty.
     """
     if not df.empty:
         destination = self.__cache_file_path(symbol, series_type)
         message = "Saving %s rows to... %s" % (len(df), destination)
         logger.debug(message)
         with StorageFile.init(destination, 'w') as csv_file:
             df.to_csv(csv_file)
         return destination
     # Nothing to write for an empty frame.
     logger.info("Data frame is empty.")
     return None
示例#5
0
 def test_text_read_write(self):
     """Write text to a new file opened in 'w+' mode, read it back, then delete the file."""
     scratch_path = os.path.join(self.TEST_ROOT, "temp_file.txt")
     payload = "abc"
     with StorageFile.init(scratch_path, 'w+') as scratch:
         self.assertTrue(scratch.writable())
         self.assertEqual(scratch.tell(), 0)
         # write() is expected to report the number of characters written.
         written = scratch.write(payload)
         self.assertEqual(written, 3)
         self.assertEqual(scratch.tell(), 3)
         # Rewind before reading back what was just written.
         scratch.seek(0)
         self.assertEqual(scratch.read(), payload)
         # TODO: File may not exist on the cloud until it is closed.
         # self.assertTrue(scratch.exists())
     # Clean up once the file has been closed by the with-block.
     scratch.delete()
示例#6
0
文件: source.py 项目: qiuosier/Virgo
    def __intraday_get_full_data(self, symbol):
        """Gets the most recent intraday data (which may include data of multiple days.)

        When ``__intraday_valid_cache`` returns a file, that file is read and
        returned. Otherwise the full 1-minute series is requested, saved to a
        time-stamped cache file, and every date considered complete is also
        saved to its own per-date cache file.

        Args:
            symbol (str): The symbol of the equity/stock.

        Returns: A pandas data frame of intraday series data.

        """
        series_type = self.intraday_series_type

        cached_file = self.__intraday_valid_cache(symbol)
        if cached_file:
            logger.debug("Reading cached file: %s" % cached_file.uri)
            with cached_file('r') as cache_reader:
                return pd.read_csv(cache_reader, index_col=0, parse_dates=['timestamp'])

        df = self.__request_data(symbol, series_type, 'full', interval="1min")

        # The full download goes to "<cache>/<prefix><formatted current time>".
        prefix = os.path.join(self.cache, self.__intraday_cache_file_prefix(symbol))
        stamp = datetime.datetime.now().strftime(self.intraday_time_fmt)
        logger.debug("Saving intraday data...")
        with StorageFile.init(prefix + stamp, 'w') as full_file:
            df.to_csv(full_file)

        def day_of(group_key):
            # Keep only the date part of the group key's string form.
            return str(group_key).split(" ")[0]

        # Group data by date
        by_day = df.groupby(df['timestamp'].dt.normalize())
        # Get the latest date in the data frame
        latest = max([day_of(group_key) for group_key, _ in by_day])

        for group_key, day_rows in by_day:
            day = day_of(group_key)
            closing_rows = day_rows[day_rows.timestamp == day + " 16:00:00"]
            # A date is incomplete when it has no 16:00 row and is the latest one.
            if closing_rows.empty and day >= latest:
                continue
            day_path = self.__cache_file_path(symbol, series_type, day)
            with StorageFile.init(day_path, 'w') as day_file:
                day_rows.reset_index(drop=True).to_csv(day_file)
        return df
示例#7
0
文件: vcf.py 项目: qiuosier/Cancer
 def __init__(self, uri, annotation_uri):
     """Load the file at ``uri`` and pair each variant line with its annotation.

     The content is split on newlines. Lines starting with "#" are stored
     in ``self.headers``; every other non-empty line becomes a ``Variant``
     in ``self.variants``, built with the annotation looked up by
     ``self.variant_key(line)``.

     Args:
         uri: Location of the file, passed to ``StorageFile.init``.
         annotation_uri: Location passed to ``self.load_annotation``.
     """
     super().__init__(uri)
     # Read inside a context manager so the storage handle is closed instead
     # of being left open until garbage collection.
     with StorageFile.init(uri) as source:
         self.content = source.read()
     # The file may come back as bytes; normalise to text before splitting.
     if isinstance(self.content, bytes):
         self.content = self.content.decode()
     self.content = self.content.split("\n")
     self.headers = []
     self.variants = []
     self.annotations = self.load_annotation(annotation_uri)
     for line in self.content:
         # Skip blank lines (e.g. the empty string after a trailing newline).
         if not line:
             continue
         if line.startswith("#"):
             self.headers.append(line)
         else:
             key = self.variant_key(line)
             # annotations.get() yields None when the key has no annotation.
             self.variants.append(Variant(line, self.annotations.get(key)))
示例#8
0
    def peek_barcode(self):
        """Scan the first 4000 lines of the gzipped file for barcodes.

        Only lines starting with "@" are inspected. The text after the last
        ":" is taken as the raw barcode; when it matches
        ``self.dual_index_pattern`` it is converted with
        ``self.convert_barcode`` and recorded.

        Returns:
            dict: Maps each converted barcode to the value returned by
            ``__count_barcode`` for it.
        """
        found = {}
        with StorageFile.init(self.file_path, 'rb') as raw, \
                gzip.GzipFile(fileobj=raw) as reader:
            for line_no, raw_line in enumerate(reader, start=1):
                if line_no > 4000:
                    break
                # The line containing barcode starts with @
                if not raw_line.startswith(b"@"):
                    continue
                text = raw_line.decode() if isinstance(raw_line, bytes) else raw_line
                # Raw barcode
                barcode = text.strip().split(":")[-1]

                if not re.match(self.dual_index_pattern, barcode):
                    continue
                barcode = self.convert_barcode(barcode)
                found[barcode] = self.__count_barcode(found, barcode, line_no)
        return found
示例#9
0
 def create_file(cls, relative_path, content):
     """Writes content to a file whose path is relative to the test root.

     Args:
         relative_path: Path joined onto ``cls.TEST_ROOT``.
         content: Data written to the file, which is opened in "w" mode.
     """
     destination = os.path.join(cls.TEST_ROOT, relative_path)
     with StorageFile.init(destination, "w") as handle:
         handle.write(content)
示例#10
0
 def read_count(self):
     """Count the items produced by iterating over this object.

     ``self.gzip`` is replaced with a new gzip stream over the object
     returned by ``StorageFile.init(self.uri, "rb").local()`` before
     counting.
     NOTE(review): presumably iteration reads from ``self.gzip`` and each
     item is one read; confirm against the class's iterator.

     Returns:
         int: The number of items yielded by iterating over ``self``.
     """
     logger.debug("Counting reads in file %s..." % self.uri)
     self.gzip = gzip.GzipFile(
         fileobj=StorageFile.init(self.uri, "rb").local())
     # Count lazily: building a list of every item only to take its length
     # would hold the whole file's worth of items in memory.
     return sum(1 for _ in self)
示例#11
0
 def __init__(self, uri):
     """Open the file at ``uri`` as a gzip stream.

     Args:
         uri: Location of the gzipped file, opened through
             ``StorageFile.init`` in binary read mode.
     """
     self.uri = uri
     raw_stream = StorageFile.init(uri, "rb")
     # Decompress on the fly from the underlying binary stream.
     self.gzip = gzip.GzipFile(fileobj=raw_stream)
     self.current = 0