Exemplos de BZ2File em Python, exemplos de bz2file.BZ2File em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: byte_aligned.py Projeto: lowth01m/FlightDataUtilities

def main():
    """Command-line entry point: inspect a data file and optionally check sync.

    Parses ``file_path``, ``--words``, ``--debug`` and ``--check-sync`` from
    the command line. The file is opened through ``bz2.BZ2File`` when its
    extension is ``.bz2`` (case-insensitive) and as a plain binary file
    otherwise, then handed to ``inspect`` together with the word count. When
    ``inspect`` returns a truthy result and ``--check-sync`` was given, the
    file is rewound and passed to ``check_sync`` with the unpacked result.

    The file is closed even when ``inspect`` or ``check_sync`` raises.
    """
    # Single-argument print calls produce the same output whether print is
    # the Python 2 statement or the Python 3 function.
    print('FlightDataInspector (c) Copyright 2013 Flight Data Services, Ltd.')
    print('  - Powered by POLARIS')
    print('  - http://www.flightdatacommunity.com')
    print('')

    parser = argparse.ArgumentParser()

    parser.add_argument('file_path')
    parser.add_argument('--words', action='store', default=16384, type=int,
                        help='Number of words to read from the file.')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug logging.')
    parser.add_argument('--check-sync', action='store_true',
                        help='Check sync in the whole data.')

    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)

    if os.path.splitext(args.file_path)[1].lower() == '.bz2':
        file_obj = bz2.BZ2File(args.file_path)
    else:
        file_obj = open(args.file_path, 'rb')

    try:
        res = inspect(file_obj, args.words)

        if res and args.check_sync:
            wps, word_index, pattern_name = res
            # inspect() has consumed part of the file; start over for the
            # whole-file sync check.
            file_obj.seek(0)
            check_sync(file_obj, wps, word_index, pattern_name)
    finally:
        file_obj.close()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: compress_bzip2.py Projeto: xabiugarte/memscrimper

def decompress(source, target):
    """Decompress ``source`` into ``target``, printing progress to stdout.

    The header at the start of ``source`` is parsed with
    ``util.parse_header``; the remainder of the same file object is read
    through ``bz2file.BZ2File`` in chunks of the header's page size and
    written to ``target``.

    :param source: path of the compressed input file
    :param target: path of the file to write the decompressed data to
    """
    logging.debug("Starting decompression of %s to %s", repr(source),
                  repr(target))
    with open(source, "rb") as fsource:
        logging.debug("Parsing header")
        (magic, method, majorversion, minorversion, pagesize,
         uncompressed_size) = util.parse_header(fsource)
        logging.debug("    Magic number: %s", repr(magic))
        logging.debug("    Method: %s", repr(method))
        logging.debug("    Major version number: %d", majorversion)
        logging.debug("    Minor version number: %d", minorversion)
        logging.debug("    Page size: %d", pagesize)
        logging.debug("    Uncompressed size: %d", uncompressed_size)
        with open(target, "wb") as ftarget:
            curr_size = 0.0
            pagecnt = 0
            # Use a distinct name for the decompressing wrapper instead of
            # rebinding ``fsource`` while the raw file is still in use.
            with bz2file.BZ2File(filename=fsource, mode="rb",
                                 compresslevel=9) as bzsource:
                while True:
                    if pagecnt % 100 == 0 or curr_size == uncompressed_size:
                        # A header announcing zero bytes would otherwise
                        # divide by zero; report it as complete.
                        if uncompressed_size:
                            progress = curr_size / uncompressed_size * 100
                        else:
                            progress = 100.0
                        sys.stdout.write(
                            "\rProgress: {:.2f}%".format(progress))
                        sys.stdout.flush()
                    page = bzsource.read(pagesize)
                    if not page:
                        break
                    ftarget.write(page)
                    curr_size += len(page)
                    pagecnt += 1
            sys.stdout.write("\n")
    logging.debug("Done")

Exemplo n.º 3

0

Exibir arquivo

def get_fileobj(filename,
                mode="r",
                gzip_only=False,
                bz2_only=False,
                zip_only=False):
    """
    Open ``filename`` and return a file object, using a decompressing reader
    when the file is detected as gzip, bz2 or zip.

    For a zip file the returned object reads the first member of the archive.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param gzip_only: only open file if file is gzip compressed or not compressed
    :param bz2_only: only open file if file is bz2 compressed or not compressed
    :param zip_only: only open file if file is zip compressed or not compressed
    """
    # The compression readers do not understand 'U', so they get plain 'r';
    # the uncompressed fallback still receives the caller's original mode.
    cmode = 'r' if mode == 'U' else mode

    # Each format is considered only when no *other* format was requested
    # exclusively.
    gzip_allowed = not (bz2_only or zip_only)
    bz2_allowed = not (gzip_only or zip_only)
    zip_allowed = not (bz2_only or gzip_only)

    if gzip_allowed and is_gzip(filename):
        return gzip.GzipFile(filename, cmode)
    if bz2_allowed and is_bz2(filename):
        return bz2.BZ2File(filename, cmode)
    if zip_allowed and zipfile.is_zipfile(filename):
        with zipfile.ZipFile(filename, cmode) as archive:
            first_member = archive.namelist()[0]
            return archive.open(first_member, cmode)
    return open(filename, mode)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: signature.py Projeto: lgautier/sourmash

def _guess_open(filename):
    """
    Make a best-effort guess as to how to parse the given sequence file.

    Handles '-' as shortcut for stdin.
    Deals with .gz and .bz2 as well as plain text.
    """
    magic_dict = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
    }  # Inspired by http://stackoverflow.com/a/13044946/1585509

    if filename == '-':
        filename = '/dev/stdin'

    bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
    num_bytes_to_peek = max(len(x) for x in magic_dict)
    file_start = bufferedfile.peek(num_bytes_to_peek)
    compression = None
    for magic, ftype in magic_dict.items():
        if file_start.startswith(magic):
            compression = ftype
            break
    if compression is 'bz2':
        sigfile = bz2file.BZ2File(filename=bufferedfile)
    elif compression is 'gz':
        if not bufferedfile.seekable():
            bufferedfile.close()
            raise ValueError("gziped data not streamable, pipe through zcat \
                            first")
        sigfile = gzip.GzipFile(filename=filename)
    else:
        sigfile = bufferedfile

    return sigfile

Exemplo n.º 5

0

Exibir arquivo

Arquivo: download.py Projeto: ychalier/flont

def populate_database(database_filename, dump_filename):
    """Step 5.
    Read and parse the downloaded file, and every time an article is
    encountered, we insert it in the database.

    Only ``page`` elements whose ``ns`` text is "0" and whose revision text
    contains the "== {{langue|fr}} ==" marker are inserted; the text is
    passed through ``clear_article_content`` first. Insertions are committed
    once, after the whole dump has been read. The database connection and the
    progress bar are closed even if parsing fails.

    :param database_filename: path of the SQLite database to fill
    :param dump_filename: path of the bz2-compressed XML dump
    """
    logging.info("Populating database (there are ca. 4M pages)...")
    connection = sqlite3.connect(database_filename)
    try:
        cursor = connection.cursor()
        with bz2file.BZ2File(dump_filename) as xml_file:
            parser = xml.etree.ElementTree.iterparse(xml_file)
            pbar = tqdm.tqdm(unit="page")
            try:
                for event, element in parser:
                    if event != "end" or element.tag != NS + "page":
                        continue
                    pbar.update(1)
                    if element.find(NS + "ns").text == "0":
                        title = element.find(NS + "title").text
                        content = element.find(
                            NS + "revision").find(NS + "text").text
                        # ``.text`` is None for an element without text, and
                        # ``in`` on None would raise TypeError.
                        if (content is not None
                                and "== {{langue|fr}} ==" in content):
                            clean_content = clear_article_content(content)
                            cursor.execute(
                                """INSERT INTO entries (title, content) VALUES (?, ?)""",
                                (title, clean_content))
                    # Release the parsed page so memory does not grow with
                    # the size of the dump.
                    element.clear()
            finally:
                pbar.close()
        logging.info("Commiting database insertions...")
        connection.commit()
    finally:
        connection.close()

Exemplo n.º 6

0

Exibir arquivo

    def parseWikipedia(self, inPath, outPath, titlesPath):
        assert inPath != outPath

        self.loadTitles(titlesPath)

        self.outFile = None
        if outPath:
            self.outFile = codecs.open(outPath, "wt", "utf-8")

        compressed = inPath.endswith(".bz2")
        originalFile = open(inPath, "r" if compressed else "rt")
        if inPath.endswith(".bz2"):
            f = bz2file.BZ2File(originalFile, mode="r")
        else:
            f = originalFile

        lineNum = 0
        c = codecs.iterdecode(f, "utf-8")
        for line in c:
            if lineNum % 100000 == 0:
                print "Processing line", lineNum, "title", (
                    self.numTitles, self.numSkipped), "=", self.title
            self.processLine(line)
            lineNum += 1

        originalFile.close()
        if self.outFile:
            self.outFile.close()

Exemplo n.º 7

0

Exibir arquivo

def multi_open(name):
    """Open ``name`` for reading, choosing the opener from its suffix.

    Names ending in '.gz' go through ``gzip.open``, names ending in '.bz2'
    through ``bz2.BZ2File``; anything else is opened with the built-in
    ``open`` in its default mode.

    :param name: path of the file to open
    :return: the opened file object
    """
    if name.endswith('.gz'):
        return gzip.open(name)
    if name.endswith('.bz2'):
        return bz2.BZ2File(name)
    return open(name)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: parser_mp.py Projeto: kblissett/wiki-dump-parser

def get_index(path):
    """Collect the leading number of every index line in a bz2 file.

    Each line is searched for ``<digits>:<digits>:<text>``; the first number
    of every match is collected. Lines that do not match are skipped.

    :param path: path of the bz2-compressed index file
    :return: the distinct leading numbers in ascending order, followed by a
        trailing ``-1`` entry
    """
    pattern = re.compile(r'(\d+):\d+:.+')
    numbers = set()
    with bz2file.BZ2File(path) as index_file:
        for line in index_file:
            # BZ2File yields bytes; decode so the text pattern applies on
            # Python 3 as well as Python 2.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            m = pattern.search(line)
            if m is None:
                continue
            numbers.add(int(m.group(1)))
    res = sorted(numbers)
    res.append(-1)
    return res

Exemplo n.º 9

0

Exibir arquivo

Arquivo: openscreed.py Projeto: chelseaju/TahcoRoll

    def open_reader(self, filename, *args, **kwargs):
        """
        Make a best-effort guess as to how to parse the given sequence file.

        Handles '-' as shortcut for stdin.
        Deals with .gz, FASTA, and FASTQ records.

        Compression (gzip or bz2) is detected from the first bytes of the
        file. The first decompressed character then selects the record
        parser: '>' selects ``fasta_iter`` and '@' selects ``fastq_iter``.
        The opened file is stored in ``self.sequencefile``.

        Returns the iterator produced by the selected parser, or an empty
        list for an empty file. Raises ValueError for gzip data from a
        non-seekable source and for an unrecognised first character.
        """
        magic_dict = {
            b"\x1f\x8b\x08": "gz",
            b"\x42\x5a\x68": "bz2",
            # "\x50\x4b\x03\x04": "zip"
        }  # Inspired by http://stackoverflow.com/a/13044946/1585509
        filename = _normalize_filename(filename)
        bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
        num_bytes_to_peek = max(len(x) for x in magic_dict)
        file_start = bufferedfile.peek(num_bytes_to_peek)
        compression = None
        for magic, ftype in magic_dict.items():
            if file_start.startswith(magic):
                compression = ftype
                break

        # Compare by value; identity checks against string literals only
        # work when the interpreter happens to intern both strings.
        if compression == 'bz2':
            sequencefile = bz2file.BZ2File(filename=bufferedfile)
            peek = sequencefile.peek(1)
        elif compression == 'gz':
            seekable = bufferedfile.seekable()
            # gzip data is reopened by name below, so the probing handle is
            # not needed on either path.
            bufferedfile.close()
            if not seekable:
                raise ValueError(
                    "gziped data not streamable, pipe through zcat first")
            # Read the first byte through a throwaway reader so the reader
            # handed to the parser still starts at the beginning.
            with gzip.GzipFile(filename=filename) as probe:
                peek = probe.read(1)
            sequencefile = gzip.GzipFile(filename=filename)
        else:
            peek = bufferedfile.peek(1)
            sequencefile = bufferedfile

        iter_fn = None
        try:
            first_char = peek[0]
        except IndexError:
            # Empty file: nothing will read from the handles, so close them.
            sequencefile.close()
            bufferedfile.close()
            return []

        # Indexing bytes yields an int on Python 3 and a one-character str
        # on Python 2; chr() only applies to the former.
        try:
            first_char = chr(first_char)
        except TypeError:
            pass

        if first_char == '>':
            iter_fn = fasta_iter
        elif first_char == '@':
            iter_fn = fastq_iter

        if iter_fn is None:
            sequencefile.close()
            bufferedfile.close()
            raise ValueError("unknown file format for '%s'" % filename)

        self.sequencefile = sequencefile
        return iter_fn(sequencefile, *args, **kwargs)

Exemplo n.º 10

0

Exibir arquivo

def _open_bz2(filename, mode):
    """Open a bz2-compressed file with the opener matching the interpreter.

    On Python 3 this delegates to ``bz2.open``. Otherwise ``bz2.BZ2File`` is
    used, and a mode starting with 'a' is rejected.

    :raises ImportError: if the ``bz2`` module is not available
    :raises ValueError: for an append mode when not running on Python 3
    """
    if bz2 is None:
        raise ImportError("Cannot open bz2 files: The bz2 module is not available")
    if not _PY3:
        if mode[0] == 'a':
            raise ValueError("Mode '{}' not supported with BZ2 compression".format(mode))
        return bz2.BZ2File(filename, mode)
    return bz2.open(filename, mode)

Exemplo n.º 11

0

Exibir arquivo

def xml_to_csv(filename):
    """Write one hard-coded revision of a bz2-compressed dump to a CSV file.

    The dump is read through ``Dump.from_file``. The output path is
    ``filename`` with its last three characters replaced by "2csv". A
    semicolon-separated header line is written first; then the pages are
    scanned for page id '12' and, within it, revision id '876580929'. That
    revision is written as one semicolon-separated row and the function
    returns immediately.

    :param filename: path of the bz2-compressed dump
    :return: ``None`` once the target revision has been written; ``True``
        when the dump is exhausted without finding it
    """
    # Construct dump file iterator
    input_file = Dump.from_file(bz2file.BZ2File(filename))

    print("Processing...")
    # The context manager closes the output on the early return below too.
    with open(filename[0:-3] + "2csv", 'w') as output_csv:
        # writing header for output csv file
        # NOTE(review): the data row carries a trailing ``text`` field that
        # has no header column - confirm whether one should be added.
        output_csv.write(";".join([
            "page_id", "page_title", "page_ns", "revision_id",
            "revision_parent", "timestamp", "contributor_id",
            "contributor_name", "comments", "model", "bytes",
        ]))
        output_csv.write("\n")
        # Iterate through pages
        par = tqdm.tqdm()
        for page in input_file.pages:
            par.update(1)
            # get page info
            page_id = str(page.id)
            page_title = '|{}|'.format(page.title)
            page_ns = str(page.namespace)
            if page_id != '12':
                continue
            for revision in page:
                if revision is None:
                    continue
                # get revision info
                revision_id = str(revision.id)
                if revision_id != '876580929':
                    continue
                text = str(revision.text)
                revision_parent = ('-1' if revision.parent_id is None
                                   else str(revision.parent_id))
                timestamp = str(revision.timestamp)
                revision_bytes = ('-1' if revision.bytes is None
                                  else str(revision.bytes))

                contributor_id = str(revision.user.id)
                contributor_name = str(revision.user.text)

                comment = str(revision.comment)
                model = str(revision.model)

                revision_row = [
                    page_id, page_title, page_ns, revision_id,
                    revision_parent, timestamp, contributor_id,
                    contributor_name, comment, model, revision_bytes,
                    text
                ]
                output_csv.write(";".join(revision_row) + '\n')
                return

        print("Done processing")
    return True

Exemplo n.º 12

0

Exibir arquivo

    def __init__(self, input_file):
        """Record the input name and select the stream to read from.

        "-" (ignoring surrounding whitespace) selects ``sys.stdin``; a name
        ending in ".bz2" is opened unbuffered through ``bz2file.BZ2File``.

        :param input_file: input file name, or "-" for standard input
        """
        self.filename = input_file
        self.indexed = False

        if input_file.strip() == "-":
            ifile = sys.stdin
        elif input_file.endswith(".bz2"):
            # Errors from BZ2File propagate unchanged. The former
            # ``except Exception, e: raise e`` wrapper was Python 2-only
            # syntax and only shortened the traceback.
            ifile = bz2file.BZ2File(input_file, "r", buffering=0)

Exemplo n.º 13

0

Exibir arquivo

def extract_bzip2(archive, compression, cmd, verbosity, interactive, outdir):
    """Extract a BZIP2 archive with the bz2 Python module.

    The output path comes from ``util.get_single_outfile(outdir, archive)``.
    Data is copied in chunks of ``READ_SIZE_BYTES``. Any exception raised
    while copying is re-raised as ``util.PatoolError``. Always returns None.
    """
    targetname = util.get_single_outfile(outdir, archive)
    try:
        with bz2.BZ2File(archive) as source:
            with open(targetname, 'wb') as sink:
                while True:
                    block = source.read(READ_SIZE_BYTES)
                    if not block:
                        break
                    sink.write(block)
    except Exception as err:
        raise util.PatoolError(
            "error extracting %s to %s: %s" % (archive, targetname, err))
    return None

Exemplo n.º 14

0

Exibir arquivo

Arquivo: download_tweets.py Projeto: digitalepidemiologylab/vaccine_sentiment_england

def get_open(path, mode, file_type=None, encoding='utf-8'):
    """Open ``path`` as a text stream, decompressing when asked to.

    :param path: path of the file to open
    :param mode: mode passed to the underlying opener
    :param file_type: 'gzip' or 'bz2' to go through the matching compressed
        file class; any other value opens the file directly with ``io.open``
    :param encoding: text encoding of the returned stream
    :return: a text-mode file object
    """
    def as_text(raw):
        # Readers get a buffering layer between the raw compressed stream
        # and the text wrapper; writers are wrapped directly.
        stream = io.BufferedReader(raw) if 'r' in mode else raw
        return io.TextIOWrapper(stream, encoding=encoding)

    if file_type == 'gzip':
        return as_text(gzip.GzipFile(path, mode))
    if file_type != 'bz2':
        return io.open(path, mode, encoding=encoding)
    # Imported lazily so the dependency is only needed for bz2 input.
    import bz2file
    return as_text(bz2file.BZ2File(path, mode))

Exemplo n.º 15

0

Exibir arquivo

Arquivo: test_conversion.py Projeto: akifoss/dark-matter

 def testSaveAsJSONBzip2(self):
     """
     Saving a DiamondTabularFormatReader as JSON through a bzip2 writer must
     produce the bzip2-compressed form of the expected JSON dump.
     """
     fakeOpen = mockOpen(read_data=DIAMOND_RECORDS)
     with patch.object(builtins, 'open', fakeOpen):
         reader = DiamondTabularFormatReader('file.txt')
         buf = BytesIO()
         writer = bz2file.BZ2File(buf, 'w')
         reader.saveAsJSON(writer, writeBytes=True)
         # Closing the writer flushes the compressed stream into ``buf``.
         writer.close()
         expected = compress(DIAMOND_RECORDS_DUMPED.encode('UTF-8'))
         self.assertEqual(expected, buf.getvalue())

Exemplo n.º 16

0

Exibir arquivo

Arquivo: feature_reader.py Projeto: nikolausn/htrc-feature-reader

    def _read_json(self, path_or_url, compressed=True, advanced_path=False):
        ''' Load JSON for a path. Allows remote files in addition to local ones.

        For an http(s) URL the response body is downloaded into memory and
        read as uncompressed data, whatever ``compressed`` says. A local path
        is read through ``bz2.BZ2File`` when ``compressed`` is true and as
        UTF-8 text otherwise. Only the first line is read and parsed as JSON.

        ``advanced_path`` is accepted but not used by this method.

        Returns the parsed JSON. Errors while fetching, opening, reading or
        parsing are logged and re-raised.
        '''
        if parse_url(path_or_url).scheme in ['http', 'https']:
            try:
                req = _urlopen(path_or_url)
                filename_or_buffer = BytesIO(req.read())
            except HTTPError:
                logging.exception("HTTP Error accessing %s" % path_or_url)
                raise
            compressed = False
        else:
            filename_or_buffer = path_or_url

        try:
            if compressed:
                f = bz2.BZ2File(filename_or_buffer)
            elif isinstance(filename_or_buffer, (BytesIO, StringIO)):
                f = filename_or_buffer
            else:
                # Read-only: 'r+' would demand write permission on a file
                # that is only ever read here.
                f = codecs.open(filename_or_buffer, 'r', encoding="utf-8")
            try:
                rawjson = f.readline()
            finally:
                # Close the handle even when reading fails.
                f.close()
        except IOError:
            logging.exception(
                "Can't read %s. Did you pass the incorrect "
                "'compressed=' argument?", path_or_url)
            raise
        except Exception:
            print(compressed, type(filename_or_buffer))
            logging.exception("Can't open %s", path_or_url)
            raise

        # This is a bandaid for schema version 2.0, not over-engineered
        # since upcoming releases of the extracted features
        # dataset won't keep the basic/advanced split

        try:
            # For Python3 compatibility, decode to str object
            if PY3 and not isinstance(rawjson, str):
                rawjson = rawjson.decode()
            volumejson = json.loads(rawjson)
        except Exception:
            logging.exception(
                "Problem reading JSON for %s. One common reason"
                " for this error is an incorrect compressed= "
                "argument", path_or_url)
            raise
        return volumejson

Exemplo n.º 17

0

Exibir arquivo

Arquivo: wiki.py Projeto: ucaslyc/tensor2tensor

def page_generator(tmp_dir, max_docs=None):
  """Yield the raw text of each page found in the bz2-compressed corpus.

  The corpus path comes from `_maybe_download_corpus(tmp_dir)`. A page starts
  at a line equal to "  <page>\\n" and ends at a line equal to "  </page>\\n";
  lines outside a page are ignored. The corpus file is closed when the
  generator finishes or is closed.

  Args:
    tmp_dir: directory passed to `_maybe_download_corpus`.
    max_docs: if truthy, stop after this many pages.

  Yields:
    One unicode string per page, including its opening and closing lines.
  """
  doc = u""
  count = 0
  corpus_filepath = _maybe_download_corpus(tmp_dir)
  with bz2file.BZ2File(corpus_filepath, "r", buffering=1000000) as corpus:
    for line in corpus:
      line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
      # Skip everything until a page opens.
      if not doc and line != u"  <page>\n":
        continue
      doc += line
      if line == u"  </page>\n":
        yield doc
        doc = u""
        count += 1
        if max_docs and count >= max_docs:
          break

Exemplo n.º 18

0

Exibir arquivo

def _unpack_zip(zipfile, all_tasks):
    """Unpack a bz2-compressed tar archive from ZIP_DIR into STAGE_DIR.

    After extraction ``move_results(all_tasks)`` is called and the archive
    name is appended to ``abed_unzipped.txt`` in ZIP_DIR. If the archive
    cannot be read as a tar file, an error is reported and nothing else is
    done.

    :param zipfile: file name of the archive, relative to settings.ZIP_DIR
    :param all_tasks: passed through to ``move_results``
    """
    fpath = '%s%s%s' % (settings.ZIP_DIR, os.sep, zipfile)
    b = bz2file.BZ2File(fpath)
    try:
        try:
            tar = tarfile.open(fileobj=b)
        except tarfile.ReadError:
            error("Could not read tarfile: %s" % fpath)
            return
        try:
            mkdir(settings.STAGE_DIR)
            # NOTE(review): extractall() trusts member paths; only use this
            # on archives from a trusted source.
            tar.extractall(settings.STAGE_DIR)
        finally:
            tar.close()
    finally:
        # tarfile does not close a file object it was handed, so the
        # decompressor is closed here on every path.
        b.close()
    move_results(all_tasks)
    ziplog = settings.ZIP_DIR + os.sep + 'abed_unzipped.txt'
    with open(ziplog, 'a') as fid:
        fid.write(zipfile + '\n')

Exemplo n.º 19

0

Exibir arquivo

def create_bzip2(archive, compression, cmd, verbosity, interactive, filenames):
    """Create a BZIP2 archive with the bz2 Python module.

    Exactly one input file is supported; more than one raises
    ``util.PatoolError``. The file is copied into the archive in chunks of
    ``READ_SIZE_BYTES``, and any exception raised while doing so is re-raised
    as ``util.PatoolError``. Always returns None.
    """
    if len(filenames) > 1:
        raise util.PatoolError(
            'multi-file compression not supported in Python bz2')
    try:
        with bz2.BZ2File(archive, 'wb') as sink:
            with open(filenames[0], 'rb') as source:
                while True:
                    block = source.read(READ_SIZE_BYTES)
                    if not block:
                        break
                    sink.write(block)
    except Exception as err:
        raise util.PatoolError("error creating %s: %s" % (archive, err))
    return None

Exemplo n.º 20

0

Exibir arquivo

Arquivo: compress_bzip2.py Projeto: xabiugarte/memscrimper

def compress(source, target, pagesize=4096):
    """Compress ``source`` into ``target``, printing progress to stdout.

    A header built by ``util.create_header("bzip2", <source size>)`` is
    written first; the pages yielded by ``util.get_pages`` are then written
    through a ``bz2file.BZ2File`` layered on the same output file.

    :param source: path of the file to compress
    :param target: path of the compressed file to create
    :param pagesize: page size handed to ``util.get_pages``
    """
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    total = os.path.getsize(source)
    with open(target, "wb") as raw_out:
        raw_out.write(util.create_header("bzip2", total))
        with bz2file.BZ2File(filename=raw_out, mode="wb",
                             compresslevel=9) as packed_out:
            pages = util.get_pages(source, pagesize=pagesize)
            for idx, page in enumerate(pages):
                # Report every 100th page, and the page that ends exactly at
                # the source size.
                on_step = idx % 100 == 0
                if on_step or (idx + 1) * pagesize == total:
                    percent = float(idx * pagesize) / total * 100
                    sys.stdout.write("\rProgress: {:.2f}%".format(percent))
                    sys.stdout.flush()
                packed_out.write(page)
    sys.stdout.write("\n")
    logging.debug("Done")

Exemplo n.º 21

0

Exibir arquivo

def check_bz2(file_path, check_content=True):
    """Check whether ``file_path`` starts with the bz2 magic number.

    :param file_path: path of the file to examine
    :param check_content: when true, also decompress the first
        ``CHUNK_SIZE`` bytes and run ``check_html`` on them
    :return: ``(False, False)`` if the file cannot be read or its first three
        bytes differ from ``util.bz2_magic``; ``(True, True)`` if the magic
        matches and either the content check is skipped or ``check_html``
        reports nothing; ``(True, False)`` if ``check_html`` reports a match
    """
    not_bz2 = (False, False)
    try:
        with open(file_path, "rb") as probe:
            header = probe.read(3)
        magic_matches = header == util.bz2_magic
    except Exception:
        return not_bz2
    if not magic_matches:
        return not_bz2

    if not check_content:
        return (True, True)

    with bz2.BZ2File(file_path, mode='rb') as bzipped_file:
        chunk = bzipped_file.read(CHUNK_SIZE)
    # A compressed HTML file flips the second flag.
    return (True, not check_html(chunk, file_path=False))

Exemplo n.º 22

0

Exibir arquivo

Arquivo: commit_util.py Projeto: scrathat/galaxy

def handle_bz2(repository, uploaded_file_name):
    """Replace a bz2-compressed upload with its decompressed contents.

    The data is decompressed in chunks of ``basic_util.CHUNK_SIZE`` into a
    temporary file created next to the upload, which is then moved over
    ``uploaded_file_name``. If reading the compressed data raises IOError,
    the temporary file is removed, the problem is logged, and the upload is
    left untouched.

    :param repository: object whose ``id`` is used in the temp file prefix
    :param uploaded_file_name: path of the uploaded bz2 file
    """
    fd, uncompressed = tempfile.mkstemp(prefix='repo_%d_upload_bunzip2_' % repository.id,
                                        dir=os.path.dirname(uploaded_file_name),
                                        text=False)
    bzipped_file = bz2.BZ2File(uploaded_file_name, 'rb')
    try:
        while 1:
            try:
                chunk = bzipped_file.read(basic_util.CHUNK_SIZE)
            except IOError:
                os.close(fd)
                os.remove(uncompressed)
                log.exception('Problem uncompressing bz2 data "%s"', uploaded_file_name)
                return
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
    finally:
        # Close the compressed source on every path, including the error
        # return above, before the upload is replaced.
        bzipped_file.close()
    shutil.move(uncompressed, uploaded_file_name)

Exemplo n.º 23

0

Exibir arquivo

Arquivo: filter_trim.py Projeto: merckey/PDX-Analysis-Workflows

    def open(name):
        """
        Intended to be private to the class...

        Open an input file that is plain text or compressed with gzip or
        bzip2, choosing the opener from the file name suffix ('.gz' or
        '.bz2'). Only used for the input files; output files are emitted
        uncompressed, until the tools in the next leg of the pipeline can
        work properly with compressed files.

        :param name: The filename to open.
        :return: A file object for the named file.
        """
        if name.endswith('.gz'):
            return gzip.open(name)
        if name.endswith('.bz2'):
            return bz2.BZ2File(name)
        return open(name)

Exemplo n.º 24

0

Exibir arquivo

Arquivo: checkers.py Projeto: MADscientist314/galaxy-lib

def check_bz2(file_path, check_content=True):
    """Check whether ``file_path`` starts with the bz2 magic number.

    :param file_path: path of the file to examine
    :param check_content: when true, also decompress the first 32 KiB and
        run ``check_html`` on it
    :return: ``(False, False)`` if the file cannot be read or its first three
        bytes differ from ``util.bz2_magic``; ``(True, True)`` if the magic
        matches and either the content check is skipped or ``check_html``
        reports nothing; ``(True, False)`` if ``check_html`` reports a match
    """
    try:
        # The magic number is raw bytes, so read in binary mode; the
        # universal-newlines "U" flag is also rejected by Python 3.11+.
        # NOTE(review): assumes util.bz2_magic is a byte string - confirm.
        with open(file_path, "rb") as temp:
            magic_check = temp.read(3)
        if magic_check != util.bz2_magic:
            return (False, False)
    except Exception:
        return (False, False)

    if not check_content:
        return (True, True)

    CHUNK_SIZE = 2**15  # 32 KiB
    with bz2.BZ2File(file_path, mode='rb') as bzipped_file:
        chunk = bzipped_file.read(CHUNK_SIZE)
    # See if we have a compressed HTML file
    if check_html(file_path, chunk=chunk):
        return (True, False)
    return (True, True)

Exemplo n.º 25

0

Exibir arquivo

def load_channels(sample, chr_list):
    """Load the per-chromosome channel files into a nested dictionary.

    For every chromosome in ``chr_list`` and each of the four channel names,
    the matching bz2-compressed file is read: the 'coverage' channel with
    ``np.load``, the others with ``pickle.load``. The loaded 'clipped_reads'
    and 'split_read_distance' values are then unpacked into several separate
    keys of the same chromosome entry.

    ``sample`` is accepted but not used by this function.

    :param sample: unused
    :param chr_list: chromosome names (strings) to load
    :return: ``defaultdict`` mapping chromosome -> channel name -> data
    """
    channel_names = [
        'clipped_reads', 'clipped_read_distance', 'coverage',
        'split_read_distance'
    ]

    channel_data = defaultdict(dict)
    for chrom in chr_list:
        logging.info('Loading data for Chr%s' % chrom)
        for ch in channel_names:
            logging.info('Loading data for channel %s' % ch)
            is_numpy = ch == 'coverage'
            suffix = '.npy.bz2' if is_numpy else '.pbz2'
            if HPC_MODE:
                base_dir = "/hpc/cog_bioinf/ridder/users/smehrem/breakpoint-pairs/NA12878_channel_data/"
            else:
                base_dir = "/home/cog/smehrem/MinorResearchInternship/NA12878/"
            filename = base_dir + ch + "/" + chrom + "_" + ch + suffix
            assert os.path.isfile(filename)

            logging.info('Reading %s for Chr%s' % (ch, chrom))
            with bz2file.BZ2File(filename, 'rb') as f:
                # NOTE(review): pickle.load executes code embedded in the
                # file; only load channel files from a trusted source.
                loader = np.load if is_numpy else pickle.load
                channel_data[chrom][ch] = loader(f)
            logging.info('End of reading')

        per_chrom = channel_data[chrom]

        # unpack clipped_reads: the loaded value holds five components, the
        # second of which replaces the 'clipped_reads' entry itself
        (per_chrom['read_quality'],
         per_chrom['clipped_reads'],
         per_chrom['clipped_reads_inversion'],
         per_chrom['clipped_reads_duplication'],
         per_chrom['clipped_reads_translocation']) = per_chrom['clipped_reads']

        # unpack split_reads
        (per_chrom['split_read_distance'],
         per_chrom['split_reads']) = per_chrom['split_read_distance']

    return channel_data

Exemplo n.º 26

0

Exibir arquivo

def open_raw_data(filepath, binary=True):
    '''
    Open the input file which may be compressed.

    The opener is chosen from the file extension, case-insensitively:
    '.sac' and '.zip' are read as zip archives holding exactly one member,
    '.bz2' is read through ``bz2.BZ2File``, anything else is opened directly.

    :param filepath: Path of raw data file which can either be zip, bz2 or uncompressed.
    :type filepath: str
    :param binary: Open an uncompressed file in binary ('rb') rather than text ('r') mode.
    :type binary: bool

    :returns: An opened file object.
    :rtype: file
    :raises IOError: If a zip archive does not contain exactly one file.
    '''
    extension = os.path.splitext(filepath)[1].lower()

    if extension in {'.sac', '.zip'}:
        zf = zipfile.ZipFile(filepath, 'r')
        filenames = zf.namelist()
        if len(filenames) != 1:
            # Nothing will read from the archive, so do not leave it open.
            zf.close()
            raise IOError('Zip files must contain only a single data file.')
        return zf.open(filenames[0])

    if extension == '.bz2':
        return bz2.BZ2File(filepath, 'r')

    return open(filepath, 'rb' if binary else 'r')

Exemplo n.º 27

0

Exibir arquivo

def page_generator(tmp_dir, max_docs=None):
    """Generate cleaned wikipedia articles as a string.

    The corpus path comes from ``_maybe_download_corpus(tmp_dir)``. Lines
    between "  <page>\\n" and "  </page>\\n" (inclusive) are collected into
    one document, whose text is extracted with ``_page_text`` and stripped of
    wiki markup with ``mwparserfromhell``. Documents for which ``_page_text``
    returns None are skipped but still count towards ``max_docs``. The corpus
    file is closed when the generator finishes or is closed.

    :param tmp_dir: directory passed to ``_maybe_download_corpus``
    :param max_docs: if truthy, stop after this many pages have been seen
    """
    doc = u""
    count = 0
    corpus_filepath = _maybe_download_corpus(tmp_dir)
    with bz2file.BZ2File(corpus_filepath, "r", buffering=1000000) as corpus:
        for line in corpus:
            line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
            # Skip everything until a page opens.
            if not doc and line != u"  <page>\n":
                continue
            doc += line
            if line == u"  </page>\n":
                doc_text = _page_text(doc)
                if doc_text is not None:
                    parsed_text = mwparserfromhell.parse(doc_text) \
                        .strip_code(normalize=True, collapse=True)
                    yield parsed_text

                doc = u""
                count += 1
                if max_docs and count >= max_docs:
                    break

Exemplo n.º 28

0

Exibir arquivo

Arquivo: compression_utils.py Projeto: yiming-kang/galaxy

def get_fileobj(filename, mode="r", compressed_formats=None):
    """
    Returns a fileobj. If the file is compressed, return an appropriate file
    reader. In text mode, always use 'utf-8' encoding.

    For a zip file the returned object reads the first member of the archive.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param compressed_formats: list of allowed compressed file formats among
      'bz2', 'gzip' and 'zip'. If left to None, all 3 formats are allowed
    """
    if compressed_formats is None:
        compressed_formats = ['bz2', 'gzip', 'zip']
    # Remove 't' from mode, which may cause an error for compressed files
    mode = mode.replace('t', '')
    # the various compression readers don't support 'U' mode,
    # so we open in 'r'.
    if mode == 'U':
        cmode = 'r'
    else:
        cmode = mode
    if 'gzip' in compressed_formats and is_gzip(filename):
        fh = gzip.GzipFile(filename, cmode)
    elif 'bz2' in compressed_formats and is_bz2(filename):
        fh = bz2.BZ2File(filename, cmode)
    elif 'zip' in compressed_formats and zipfile.is_zipfile(filename):
        # zipfile always works on bytes and rejects a 'b' flag in its mode
        # arguments, so strip it; text mode is emulated below by wrapping
        # the binary member in a TextIOWrapper.
        zmode = cmode.replace('b', '')
        # Return fileobj for the first file in a zip file.
        with zipfile.ZipFile(filename, zmode) as zh:
            fh = zh.open(zh.namelist()[0], zmode)
    elif 'b' in mode:
        return open(filename, mode)
    else:
        return io.open(filename, mode, encoding='utf-8')
    if 'b' not in mode:
        return io.TextIOWrapper(fh, encoding='utf-8')
    else:
        return fh

Exemplo n.º 29

0

Exibir arquivo

def _supported_pairs(candidate_pairs, anchor_is_first, label):
    """Return the pairs that reach ``min_support`` when grouped by one end.

    Every candidate is grouped by the id of its anchor breakpoint (the first
    end when ``anchor_is_first`` is true, the second otherwise) and, within
    that, by the chromosome and strand of the other end. Anchors seen at
    least ``min_support`` times are kept; for each partner group with at
    least ``min_support`` positions one pair is produced, using the largest
    partner position when the anchor strand is '+' and the smallest
    otherwise. The anchor is always the first breakpoint of a produced pair.
    """
    # Nested defaultdicts keep the positions of *every* partner of an
    # anchor; nothing is reset when a new partner id shows up.
    partner_positions = defaultdict(lambda: defaultdict(list))
    anchor_ids = []
    for sv in candidate_pairs:
        bp1, bp2 = sv.tuple
        anchor, partner = (bp1, bp2) if anchor_is_first else (bp2, bp1)

        anchor_id = anchor.id()
        partner_id = '_'.join([partner.chr, partner.strand])
        partner_positions[anchor_id][partner_id].append(partner.pos)
        anchor_ids.append(anchor_id)

    anchor_counts = Counter(anchor_ids)
    supported = [k for (k, v) in anchor_counts.items() if v >= min_support]
    logging.info('Min %d supported positions %s: %d/%d' %
                 (min_support, label, len(supported), len(anchor_counts)))

    pairs = set()
    for anchor_id in supported:
        # NOTE(review): assumes id() yields "<chr>_<pos>_<strand>" with no
        # further underscores - confirm against Breakpoint.id().
        anchor_chr, anchor_pos, anchor_strand = anchor_id.split('_')
        for partner_id, positions in partner_positions[anchor_id].items():
            partner_chr, partner_strand = partner_id.split('_')
            if len(positions) >= min_support:
                if anchor_strand == '+':
                    partner_pos = max(positions)
                else:
                    partner_pos = min(positions)
                pairs.add(
                    StructuralVariant(
                        Breakpoint(anchor_chr, int(anchor_pos),
                                   anchor_strand),
                        Breakpoint(partner_chr, int(partner_pos),
                                   partner_strand)))
    return pairs


def inspect_pairs(candidate_pairs, outFile):
    """Filter candidate breakpoint pairs by support and pickle the result.

    The candidates are evaluated twice, grouped first by their first
    breakpoint and then by their second one; the union of the pairs reaching
    ``min_support`` in either view is written to ``outFile`` as a
    bz2-compressed pickle.

    :param candidate_pairs: iterable of objects whose ``tuple`` attribute
        holds two breakpoints
    :param outFile: path of the bz2-compressed pickle file to write
    """
    final_pairs = set()

    # from bp1 point of view
    final_pairs |= _supported_pairs(candidate_pairs, True, 'bp1')
    logging.info('Length of pair set after BP1 perspective: %d' %
                 len(final_pairs))

    # from bp2 point of view
    final_pairs |= _supported_pairs(candidate_pairs, False, 'bp2')
    logging.info('Length of pair set after BP2 perspective: %d' %
                 len(final_pairs))

    # Write the output in pickle format
    with bz2file.BZ2File(outFile, 'wb') as f:
        pickle.dump(final_pairs, f)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: compress_interdedup.py Projeto: mbrengel/memscrimper

def compress(source, target, reference, nointra, delta, inner, pagesize=4096):
    """Write `source` to `target`, encoded relative to the pages of `reference`.

    Every page of `source` is compared with the page at the same index in
    `reference` and put into exactly one category:

    * identical to the reference page at that index: only counted, nothing
      is written for it;
    * different, but present somewhere in the reference: its page number is
      recorded under that page in the dedup table;
    * otherwise, when `delta` is not None and `util.create_diff` returns a
      non-None value: that value is stored under the page number;
    * otherwise the page is "new" and its content is written out.

    The body is first assembled in a temporary file inside ``.tmp`` and then
    copied behind the header into `target`, optionally through an inner
    compressor.

    Args:
        source: Path of the dump to compress (read via `util.get_pages`,
            sized via `os.path.getsize`).
        target: Path of the output file; opened with mode ``"wb"``.
        reference: Path of the reference dump. It is also written at the
            start of the body, followed by a ``"\\x00"`` terminator.
        nointra: If truthy, new pages are tracked as one flat list of page
            numbers and every new page is written, duplicates included.
            If falsy, new pages are grouped by content and each distinct
            page is written once.
        delta: Diffing is attempted only when this is not None; the value is
            otherwise just forwarded to `create_method_name`.
        inner: ``None`` (plain copy), ``"gzip"``, ``"bzip2"`` or ``"7zip"``.
            Any other value matches no branch, so only the header is written.
        pagesize: Only logged by this function.

    Returns:
        Always 0.

    NOTE(review): `reference_pages[i]` is indexed with page numbers taken from
    `source`, so a source that yields more pages than the reference raises
    IndexError. The exit status and stderr of ``7za`` are not inspected.
    """
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    logging.debug("Reference dump: %s", reference)

    # Index the reference dump: ordered page list plus the first page number
    # at which each distinct page content occurs.
    reference_pages = []
    reference_pagenrs = {}
    for ref_nr, ref_page in enumerate(util.get_pages(reference)):
        reference_pages.append(ref_page)
        reference_pagenrs.setdefault(ref_page, ref_nr)
    reference_pages_set = set(reference_pages)

    # Classify every source page.
    dedups = dd(list)
    diffs = {}
    diff_seen = set()
    new_pagenrs = [] if nointra else dd(list)
    new_pages = []
    same_distinct = set()
    same_total = 0
    source_pages = []
    for nr, page in enumerate(util.get_pages(source)):
        source_pages.append(page)
        counterpart = reference_pages[nr]

        # Same content at the same offset.
        if counterpart == page:
            same_total += 1
            same_distinct.add(page)
            continue

        # Content exists elsewhere in the reference.
        if page in reference_pages_set:
            dedups[page].append(nr)
            continue

        # Unknown content: try a diff first when requested.
        if delta is not None:
            patch = util.create_diff(counterpart, page)
            if patch is not None:
                diff_seen.add(page)
                diffs[nr] = patch
                continue

        # Falls through to here when no diff was requested or produced.
        if nointra:
            new_pagenrs.append(nr)
        else:
            new_pagenrs[page].append(nr)
        new_pages.append(page)

    source_pages_set = set(source_pages)
    source_total = len(source_pages)
    source_distinct = len(source_pages_set)
    new_total = len(new_pages)
    new_distinct = len(set(new_pages))

    # Turn the collected page numbers into whatever util.intervalize returns.
    if nointra:
        new_pagenrs = util.intervalize(new_pagenrs)
    else:
        new_pagenrs = {
            content: util.intervalize(numbers)
            for content, numbers in new_pagenrs.items()
        }
    dedups = {
        content: util.intervalize(numbers)
        for content, numbers in dedups.items()
    }

    # Assemble the body in a temp file, then emit header + body to target.
    util.create_dir(".tmp")
    tmphandle, tmpfile = tempfile.mkstemp(dir=".tmp")
    try:
        with open(tmpfile, "wb") as body:
            body.write(reference + "\x00")

            # Deduplicated pages, ordered by first appearance in the reference.
            emitted = set()
            ordered_dedup_pages = []
            for ref_page in reference_pages:
                if ref_page in emitted or ref_page not in dedups:
                    continue
                emitted.add(ref_page)
                ordered_dedup_pages.append(ref_page)
            first_occurrences = [
                reference_pagenrs[ref_page] for ref_page in ordered_dedup_pages
            ]
            util.create_pagenr_list(first_occurrences, body)
            for ref_page in ordered_dedup_pages:
                body.write(util.create_interval_list(dedups[ref_page]))

            # Diffs, ordered by page number.
            if delta is not None:
                diffed_pagenrs = sorted(diffs)
                util.create_pagenr_list(diffed_pagenrs, body)
                for diffed_nr in sorted(diffs):
                    body.write(diffs[diffed_nr])

            # New pages.
            if nointra:
                body.write(util.create_interval_list(new_pagenrs))
                for content in new_pages:
                    body.write(content)
            else:
                body.write(struct.pack("<I", len(new_pagenrs)))
                # Both loops walk the same dict, so interval lists and page
                # contents come out in matching order.
                for content in new_pagenrs:
                    body.write(util.create_interval_list(new_pagenrs[content]))
                for content in new_pagenrs:
                    body.write(content)

        with open(tmpfile, "rb") as body, open(target, "wb") as ftarget:
            method_name = create_method_name(nointra, delta, inner)
            uncompressed_size = os.path.getsize(source)
            ftarget.write(util.create_header(method_name, uncompressed_size))
            # Flush so the header is on disk before any wrapper or child
            # process writes to the same file.
            ftarget.flush()
            if inner is None:
                shutil.copyfileobj(body, ftarget)
            elif inner == "gzip":
                with gzip.GzipFile(fileobj=ftarget, mode="wb",
                                   compresslevel=9) as gz_out:
                    shutil.copyfileobj(body, gz_out)
            elif inner == "bzip2":
                with bz2file.BZ2File(filename=ftarget,
                                     mode="wb",
                                     compresslevel=9) as bz_out:
                    shutil.copyfileobj(body, bz_out)
            elif inner == "7zip":
                sevenzip = subprocess.Popen(
                    ["7za", "a", "-an", "-txz", "-mx=9", "-si", "-so", source],
                    stdin=body,
                    stdout=ftarget,
                    stderr=subprocess.PIPE)
                sevenzip.communicate()
    finally:
        # mkstemp hands back an open descriptor; release it and drop the file.
        os.close(tmphandle)
        os.remove(tmpfile)

    # Statistics.
    dedup_distinct = len(same_distinct.union(dedups))
    moved_total = sum(last - first + 1
                      for intervals in dedups.values()
                      for first, last in intervals)
    dedup_total = same_total + moved_total
    logging.debug("Deduplicated pages at the same offset: %d/%d (%d/%d)",
                  same_total, source_total, len(same_distinct),
                  source_distinct)
    logging.debug("Deduplicated pages at different offsets: %d/%d (%d/%d)",
                  moved_total, source_total, len(dedups), source_distinct)
    logging.debug("Deduplicated pages in total: %d/%d (%d/%d)", dedup_total,
                  source_total, dedup_distinct, source_distinct)
    if delta is not None:
        logging.debug("Diffed pages: %d/%d (%d/%d)", len(diffs),
                      source_total, len(diff_seen), source_distinct)
    logging.debug("New pages: %d/%d (%d/%d)", new_total, source_total,
                  new_distinct, source_distinct)
    logging.debug("Done")

    return 0