def main():
    """Command-line entry point for FlightDataInspector.

    Prints the banner, parses the command line, opens the input file (using
    ``bz2.BZ2File`` when the extension is ``.bz2``) and runs ``inspect`` on
    the first ``--words`` words.  When ``--check-sync`` is given and
    ``inspect`` returned a truthy result, the file is rewound and
    ``check_sync`` is run on it.

    The input file is closed even if inspection fails.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('FlightDataInspector (c) Copyright 2013 Flight Data Services, Ltd.')
    print(' - Powered by POLARIS')
    print(' - http://www.flightdatacommunity.com')
    print('')

    parser = argparse.ArgumentParser()
    parser.add_argument('file_path')
    parser.add_argument('--words', action='store', default=16384, type=int,
                        help='Number of words to read from the file.')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug logging.')
    parser.add_argument('--check-sync', action='store_true',
                        help='Check sync in the whole data.')
    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)

    if os.path.splitext(args.file_path)[1].lower() == '.bz2':
        file_obj = bz2.BZ2File(args.file_path)
    else:
        file_obj = open(args.file_path, 'rb')

    try:
        res = inspect(file_obj, args.words)
        if res and args.check_sync:
            wps, word_index, pattern_name = res
            # check_sync re-reads the data from the start.
            file_obj.seek(0)
            check_sync(file_obj, wps, word_index, pattern_name)
    finally:
        file_obj.close()
def decompress(source, target):
    """Decompress *source* (header followed by a bzip2 stream) into *target*.

    The header is parsed with ``util.parse_header``; the remainder of the
    file is read through ``bz2file.BZ2File`` one page at a time and written
    to *target*.  A progress percentage is written to stdout every 100 pages
    and whenever the written size equals the size announced in the header.

    :param source: path of the compressed input file
    :param target: path of the file to create
    """
    logging.debug("Starting decompression of %s to %s", repr(source),
                  repr(target))
    with open(source, "rb") as fsource:
        logging.debug("Parsing header")
        (magic, method, majorversion, minorversion, pagesize,
         uncompressed_size) = util.parse_header(fsource)
        logging.debug(" Magic number: %s", repr(magic))
        logging.debug(" Method: %s", repr(method))
        logging.debug(" Major version number: %d", majorversion)
        logging.debug(" Minor version number: %d", minorversion)
        logging.debug(" Page size: %d", pagesize)
        logging.debug(" Uncompressed size: %d", uncompressed_size)
        with open(target, "wb") as ftarget:
            curr_size = 0.0  # float so the percentage is a true division
            pagecnt = 0
            # The bzip2 stream starts right after the header, at the current
            # position of fsource.
            with bz2file.BZ2File(filename=fsource, mode="rb",
                                 compresslevel=9) as payload:
                while True:
                    if pagecnt % 100 == 0 or curr_size == uncompressed_size:
                        # An empty payload announces size 0; report it as
                        # complete instead of dividing by zero.
                        if uncompressed_size:
                            percent = curr_size / uncompressed_size * 100
                        else:
                            percent = 100.0
                        sys.stdout.write(
                            "\rProgress: {:.2f}%".format(percent))
                        sys.stdout.flush()
                    page = payload.read(pagesize)
                    if not page:
                        break
                    ftarget.write(page)
                    curr_size += len(page)
                    pagecnt += 1
    sys.stdout.write("\n")
    logging.debug("Done")
def get_fileobj(filename, mode="r", gzip_only=False, bz2_only=False,
                zip_only=False):
    """
    Returns a fileobj. If the file is compressed, return appropriate file
    reader.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param gzip_only: only open file if file is gzip compressed or not compressed
    :param bz2_only: only open file if file is bz2 compressed or not compressed
    :param zip_only: only open file if file is zip compressed or not compressed
    """
    # The various compression readers don't support 'U' mode,
    # so we open in 'r'.
    cmode = 'r' if mode == 'U' else mode
    if not bz2_only and not zip_only and is_gzip(filename):
        return gzip.GzipFile(filename, cmode)
    if not gzip_only and not zip_only and is_bz2(filename):
        return bz2.BZ2File(filename, cmode)
    if not bz2_only and not gzip_only and zipfile.is_zipfile(filename):
        # zipfile always works on bytes and rejects a 'b' in its mode
        # argument, so a caller asking for 'rb' must be mapped to 'r'.
        zmode = cmode.replace('b', '')
        # Return fileobj for the first file in a zip file.
        with zipfile.ZipFile(filename, zmode) as zh:
            return zh.open(zh.namelist()[0], zmode)
    return open(filename, mode)
def _guess_open(filename):
    """
    Make a best-effort guess as to how to parse the given sequence file.

    Handles '-' as shortcut for stdin.
    Deals with .gz and .bz2 as well as plain text.

    The compression is detected from the first bytes of the file, not from
    its name.  Returns a binary file object: a ``bz2file.BZ2File``, a
    ``gzip.GzipFile`` or the buffered file itself.

    Raises ValueError for gzip data coming from a non-seekable source.
    """
    magic_dict = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
    }  # Inspired by http://stackoverflow.com/a/13044946/1585509
    if filename == '-':
        filename = '/dev/stdin'
    bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
    num_bytes_to_peek = max(len(x) for x in magic_dict)
    # peek() leaves the stream position untouched.
    file_start = bufferedfile.peek(num_bytes_to_peek)
    compression = None
    for magic, ftype in magic_dict.items():
        if file_start.startswith(magic):
            compression = ftype
            break
    # Compare by value: 'is' on string literals only works by interning
    # accident and triggers a SyntaxWarning on modern Python.
    if compression == 'bz2':
        return bz2file.BZ2File(filename=bufferedfile)
    if compression == 'gz':
        seekable = bufferedfile.seekable()
        # GzipFile reopens the path itself, so the peek handle is no longer
        # needed in either case.
        bufferedfile.close()
        if not seekable:
            raise ValueError(
                "gziped data not streamable, pipe through zcat first")
        return gzip.GzipFile(filename=filename)
    return bufferedfile
def populate_database(database_filename, dump_filename):
    """Step 5. Read and parse the downloaded file, and every time an article
    is encountered, we insert it in the database.

    Only pages in namespace "0" whose text contains the French language
    header ``== {{langue|fr}} ==`` are inserted; their content is cleaned
    with ``clear_article_content`` first.  Pages without any text are
    skipped.  All insertions are committed once, at the end.

    :param database_filename: path of the SQLite database to fill
    :param dump_filename: path of the bz2-compressed XML dump
    """
    logging.info("Populating database (there are ca. 4M pages)...")
    connection = sqlite3.connect(database_filename)
    try:
        cursor = connection.cursor()
        with bz2file.BZ2File(dump_filename) as xml_file:
            parser = xml.etree.ElementTree.iterparse(xml_file)
            pbar = tqdm.tqdm(unit="page")
            try:
                for event, element in parser:
                    if event != "end" or element.tag != NS + "page":
                        continue
                    pbar.update(1)
                    if element.find(NS + "ns").text == "0":
                        title = element.find(NS + "title").text
                        content = element.find(
                            NS + "revision").find(NS + "text").text
                        # An empty <text/> element yields None, which
                        # cannot be searched; treat it as "no French
                        # section".
                        if content and "== {{langue|fr}} ==" in content:
                            clean_content = clear_article_content(content)
                            cursor.execute(
                                """INSERT INTO entries (title, content) VALUES (?, ?)""",
                                (title, clean_content))
                    # Free the parsed page so memory stays bounded.
                    element.clear()
            finally:
                pbar.close()
        logging.info("Commiting database insertions...")
        connection.commit()
    finally:
        connection.close()
def parseWikipedia(self, inPath, outPath, titlesPath):
    """Feed every line of a Wikipedia dump to ``self.processLine``.

    Loads the titles from *titlesPath*, optionally opens *outPath* as the
    UTF-8 output file (``self.outFile``), then reads *inPath* line by line,
    decoding it as UTF-8.  A ``.bz2`` input is decompressed on the fly.
    Progress is printed every 100000 lines.

    Both the input and the output file are closed even when processing
    fails.
    """
    assert inPath != outPath
    self.loadTitles(titlesPath)
    self.outFile = None
    if outPath:
        # codecs.open always opens the underlying file in binary mode.
        self.outFile = codecs.open(outPath, "w", "utf-8")
    compressed = inPath.endswith(".bz2")
    # Compressed data must be read as raw bytes; text mode would corrupt it
    # on platforms that translate newlines.  The uncompressed branch keeps
    # its original mode.
    originalFile = open(inPath, "rb" if compressed else "rt")
    try:
        if compressed:
            f = bz2file.BZ2File(originalFile, mode="r")
        else:
            f = originalFile
        lineNum = 0
        for line in codecs.iterdecode(f, "utf-8"):
            if lineNum % 100000 == 0:
                # One pre-formatted argument keeps the output identical on
                # Python 2 and 3.
                print("Processing line %s title %s = %s" % (
                    lineNum, (self.numTitles, self.numSkipped), self.title))
            self.processLine(line)
            lineNum += 1
    finally:
        originalFile.close()
        if self.outFile:
            self.outFile.close()
def multi_open(name):
    """Open *name* for reading, choosing the opener from its extension.

    ``.gz`` files are opened with ``gzip.open``, ``.bz2`` files with
    ``bz2.BZ2File``, anything else with the builtin ``open`` (each with its
    default mode).
    """
    if name.endswith('.gz'):
        return gzip.open(name)
    if name.endswith('.bz2'):
        return bz2.BZ2File(name)
    return open(name)
def get_index(path):
    """Return the sorted distinct leading numbers of a bz2 index file.

    Every line of the bz2-compressed file at *path* that looks like
    ``<number>:<number>:<rest>`` contributes its first number.  Lines that
    do not match are ignored.  The result is the ascending list of distinct
    numbers followed by a terminating ``-1``.
    """
    # Bytes pattern: BZ2File yields bytes lines (``str`` on Python 2).
    pattern = re.compile(br'(\d+):\d+:.+')
    res = set()
    with bz2file.BZ2File(path) as index_file:
        for line in index_file:
            m = pattern.search(line)
            if m is None:
                # Blank or malformed line: nothing to record.
                continue
            res.add(int(m.group(1)))
    res = sorted(res)
    res.append(-1)
    return res
def open_reader(self, filename, *args, **kwargs):
    """
    Make a best-effort guess as to how to parse the given sequence file.

    Handles '-' as shortcut for stdin.
    Deals with .gz, FASTA, and FASTQ records.

    The compression (gzip / bzip2 / none) is detected from the first bytes
    of the file; the record format from the first decompressed character
    ('>' for FASTA, '@' for FASTQ).  On success the opened file is stored in
    ``self.sequencefile`` and the matching record iterator is returned.

    Returns an empty list for an empty file.  Raises ValueError for gzip
    data on a non-seekable source and for an unrecognized record format.
    """
    magic_dict = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
        # "\x50\x4b\x03\x04": "zip"
    }  # Inspired by http://stackoverflow.com/a/13044946/1585509
    filename = _normalize_filename(filename)
    bufferedfile = io.open(file=filename, mode='rb', buffering=8192)
    num_bytes_to_peek = max(len(x) for x in magic_dict)
    file_start = bufferedfile.peek(num_bytes_to_peek)
    compression = None
    for magic, ftype in magic_dict.items():
        if file_start.startswith(magic):
            compression = ftype
            break

    # Compare by value; 'is' against a string literal relies on interning.
    if compression == 'bz2':
        sequencefile = bz2file.BZ2File(filename=bufferedfile)
        peek = sequencefile.peek(1)
    elif compression == 'gz':
        seekable = bufferedfile.seekable()
        # GzipFile reopens the path itself; the peek handle is done.
        bufferedfile.close()
        if not seekable:
            raise ValueError(
                "gziped data not streamable, pipe through zcat first")
        # GzipFile has no usable peek here, so read the first byte through
        # a throw-away reader and hand out a fresh one.
        probe = gzip.GzipFile(filename=filename)
        try:
            peek = probe.read(1)
        finally:
            probe.close()
        sequencefile = gzip.GzipFile(filename=filename)
    else:
        peek = bufferedfile.peek(1)
        sequencefile = bufferedfile

    def _release():
        # Closing an already-closed file is a no-op.
        sequencefile.close()
        bufferedfile.close()

    try:
        first_char = peek[0]
    except IndexError:
        _release()
        return []  # empty file

    try:
        # Indexing bytes gives an int on Python 3 and a str on Python 2.
        first_char = chr(first_char)
    except TypeError:
        pass

    iter_fn = None
    if first_char == '>':
        iter_fn = fasta_iter
    elif first_char == '@':
        iter_fn = fastq_iter

    if iter_fn is None:
        _release()
        raise ValueError("unknown file format for '%s'" % filename)

    self.sequencefile = sequencefile
    return iter_fn(sequencefile, *args, **kwargs)
def _open_bz2(filename, mode):
    """Open a bz2-compressed file with the given *mode*.

    Uses ``bz2.open`` when ``_PY3`` is true and ``bz2.BZ2File`` otherwise.

    Raises ImportError when the ``bz2`` module is unavailable, and
    ValueError for append modes on the non-``_PY3`` path.
    """
    if bz2 is None:
        raise ImportError("Cannot open bz2 files: The bz2 module is not available")
    if not _PY3:
        # This path refuses any mode starting with 'a'.
        if mode[0] == 'a':
            raise ValueError("Mode '{}' not supported with BZ2 compression".format(mode))
        return bz2.BZ2File(filename, mode)
    return bz2.open(filename, mode)
def xml_to_csv(filename):
    """Dump one hard-coded revision of a bz2-compressed XML dump as CSV.

    The output path is *filename* with its last three characters replaced by
    ``2csv``.  A ``;``-separated header line is written first.  The dump is
    then scanned for page id ``'12'`` and revision id ``'876580929'``; when
    that revision is found its row is written and the function returns
    ``None`` immediately.  If the revision is never found, "Done processing"
    is printed and ``True`` is returned.

    NOTE(review): the page/revision filter looks like leftover debugging
    code, and the row carries a trailing ``text`` field the header does not
    name -- confirm the intended output before relying on this.
    """
    # Construct dump file iterator
    input_file = Dump.from_file(bz2file.BZ2File(filename))

    print("Processing...")

    header = [
        "page_id", "page_title", "page_ns", "revision_id",
        "revision_parent", "timestamp", "contributor_id",
        "contributor_name", "comments", "model", "bytes",
    ]

    # The with-block closes (and flushes) the CSV on the early return too.
    with open(filename[0:-3] + "2csv", 'w') as output_csv:
        # writing header for output csv file
        output_csv.write(";".join(header))
        output_csv.write("\n")

        # Iterate through pages
        par = tqdm.tqdm()
        for page in input_file.pages:
            par.update(1)

            # get page info
            page_id = str(page.id)
            page_title = '|{}|'.format(page.title)
            page_ns = str(page.namespace)

            if page_id != '12':
                continue
            for revision in page:
                if revision is None:
                    continue
                # get revision info
                revision_id = str(revision.id)
                if revision_id != '876580929':
                    continue
                text = str(revision.text)
                revision_parent = ('-1' if revision.parent_id is None
                                   else str(revision.parent_id))
                timestamp = str(revision.timestamp)
                revision_bytes = ('-1' if revision.bytes is None
                                  else str(revision.bytes))
                contributor_id = str(revision.user.id)
                contributor_name = str(revision.user.text)
                comment = str(revision.comment)
                model = str(revision.model)
                revision_row = [
                    page_id, page_title, page_ns, revision_id,
                    revision_parent, timestamp, contributor_id,
                    contributor_name, comment, model, revision_bytes, text
                ]
                output_csv.write(";".join(revision_row) + '\n')
                return
        print("Done processing")
    return True
def __init__(self, input_file):
    """Record the input name and open the matching input stream.

    ``"-"`` (ignoring surrounding whitespace) selects ``sys.stdin``; a name
    ending in ``.bz2`` is opened with ``bz2file.BZ2File``.  Any error raised
    while opening propagates unchanged, with its original traceback.

    NOTE(review): ``ifile`` is not used in the code visible here, and no
    branch handles other file names -- the rest of the constructor may lie
    outside this view.
    """
    self.filename = input_file
    self.indexed = False
    if input_file.strip() == "-":
        ifile = sys.stdin
    elif input_file.endswith(".bz2"):
        ifile = bz2file.BZ2File(input_file, "r", buffering=0)
def extract_bzip2(archive, compression, cmd, verbosity, interactive, outdir):
    """Extract a BZIP2 archive with the bz2 Python module.

    The decompressed data is written in READ_SIZE_BYTES chunks to the file
    named by ``util.get_single_outfile(outdir, archive)``.  Any failure is
    re-raised as ``util.PatoolError``.  Always returns None.
    """
    targetname = util.get_single_outfile(outdir, archive)
    try:
        with bz2.BZ2File(archive) as source:
            with open(targetname, 'wb') as sink:
                while True:
                    block = source.read(READ_SIZE_BYTES)
                    if not block:
                        break
                    sink.write(block)
    except Exception as err:
        raise util.PatoolError(
            "error extracting %s to %s: %s" % (archive, targetname, err))
    return None
def get_open(path, mode, file_type=None, encoding='utf-8'):
    """Open *path* as a text stream, optionally through a decompressor.

    :param path: file to open
    :param mode: mode handed to the underlying opener
    :param file_type: ``'gzip'`` or ``'bz2'`` for compressed files; anything
        else opens the file directly with ``io.open``
    :param encoding: text encoding of the returned stream
    :return: a text-mode file object
    """
    def as_text(raw):
        # Readers get an extra buffering layer under the text wrapper.
        stream = io.BufferedReader(raw) if 'r' in mode else raw
        return io.TextIOWrapper(stream, encoding=encoding)

    if file_type == 'gzip':
        return as_text(gzip.GzipFile(path, mode))
    if file_type == 'bz2':
        import bz2file
        return as_text(bz2file.BZ2File(path, mode))
    return io.open(path, mode, encoding=encoding)
def testSaveAsJSONBzip2(self):
    """
    A DiamondTabularFormatReader must be able to save itself as bzip2'd
    JSON.
    """
    with patch.object(builtins, 'open', mockOpen(read_data=DIAMOND_RECORDS)):
        reader = DiamondTabularFormatReader('file.txt')
        buf = BytesIO()
        bz2fp = bz2file.BZ2File(buf, 'w')
        reader.saveAsJSON(bz2fp, writeBytes=True)
        # Closing flushes the compressed stream into the buffer.
        bz2fp.close()
        expected = compress(DIAMOND_RECORDS_DUMPED.encode('UTF-8'))
        self.assertEqual(expected, buf.getvalue())
def _read_json(self, path_or_url, compressed=True, advanced_path=False):
    ''' Load JSON for a path. Allows remote files in addition to local ones.

    Only the first line of the file is read and parsed as JSON.  For
    ``http``/``https`` URLs the body is downloaded into memory and treated
    as uncompressed, regardless of the *compressed* argument.  Local files
    are read through ``bz2.BZ2File`` when *compressed* is true, otherwise
    as UTF-8 text.

    Read and parse errors are logged and re-raised.
    '''
    if parse_url(path_or_url).scheme in ['http', 'https']:
        try:
            req = _urlopen(path_or_url)
            filename_or_buffer = BytesIO(req.read())
        except HTTPError:
            logging.exception("HTTP Error accessing %s" % path_or_url)
            raise
        compressed = False
    else:
        filename_or_buffer = path_or_url

    try:
        if compressed:
            f = bz2.BZ2File(filename_or_buffer)
        elif isinstance(filename_or_buffer, (BytesIO, StringIO)):
            f = filename_or_buffer
        else:
            # Read-only: 'r+' would needlessly require write permission.
            f = codecs.open(filename_or_buffer, 'r', encoding="utf-8")
        try:
            rawjson = f.readline()
        finally:
            f.close()
    except IOError:
        logging.exception(
            "Can't read %s. Did you pass the incorrect "
            "'compressed=' argument?", path_or_url)
        raise
    except Exception:
        print(compressed, type(filename_or_buffer))
        logging.exception("Can't open %s", path_or_url)
        raise

    # This is a bandaid for schema version 2.0, not over-engineered
    # since upcoming releases of the extracted features
    # dataset won't keep the basic/advanced split
    try:
        # For Python3 compatibility, decode to str object
        if PY3 and not isinstance(rawjson, str):
            rawjson = rawjson.decode()
        volumejson = json.loads(rawjson)
    except Exception:
        logging.exception(
            "Problem reading JSON for %s. One common reason"
            " for this error is an incorrect compressed= "
            "argument", path_or_url)
        raise
    return volumejson
def page_generator(tmp_dir, max_docs=None):
    """Yield the raw text of each page of the bz2-compressed corpus.

    The corpus path comes from ``_maybe_download_corpus(tmp_dir)``.  A page
    starts at a ``<page>`` line and ends at the matching ``</page>`` line;
    lines outside a page are skipped.  Each yielded value is the UTF-8
    decoded text of one page, both marker lines included.

    :param tmp_dir: directory handed to ``_maybe_download_corpus``
    :param max_docs: stop after this many pages (falsy means no limit)
    """
    corpus_filepath = _maybe_download_corpus(tmp_dir)
    lines = []  # joined once per page; avoids quadratic string growth
    count = 0
    # The with-block closes the corpus when the generator finishes, is
    # closed by the consumer, or is garbage-collected.
    with bz2file.BZ2File(corpus_filepath, "r", buffering=1000000) as corpus:
        for raw_line in corpus:
            line = raw_line.decode("utf-8")
            if not lines and line != u" <page>\n":
                continue
            lines.append(line)
            if line == u" </page>\n":
                yield u"".join(lines)
                lines = []
                count += 1
                if max_docs and count >= max_docs:
                    break
def _unpack_zip(zipfile, all_tasks):
    """Extract one bz2-compressed tar archive into the staging directory.

    The archive ``settings.ZIP_DIR/<zipfile>`` is extracted into
    ``settings.STAGE_DIR``, ``move_results(all_tasks)`` is called, and the
    archive name is appended to ``abed_unzipped.txt`` in the zip directory.
    If the archive is not a readable tar file, an error is reported and
    nothing else happens.

    NOTE(review): ``extractall`` trusts the member paths in the archive;
    only use this on archives from a trusted source.

    :param zipfile: file name of the archive inside ``settings.ZIP_DIR``
    :param all_tasks: passed through to ``move_results``
    """
    fpath = '%s%s%s' % (settings.ZIP_DIR, os.sep, zipfile)
    b = bz2file.BZ2File(fpath)
    try:
        try:
            tar = tarfile.open(fileobj=b)
        except tarfile.ReadError:
            error("Could not read tarfile: %s" % fpath)
            return
        try:
            mkdir(settings.STAGE_DIR)
            tar.extractall(settings.STAGE_DIR)
        finally:
            tar.close()
    finally:
        # tarfile does not close a file object it was handed.
        b.close()
    move_results(all_tasks)
    ziplog = settings.ZIP_DIR + os.sep + 'abed_unzipped.txt'
    with open(ziplog, 'a') as fid:
        fid.write(zipfile + '\n')
def create_bzip2(archive, compression, cmd, verbosity, interactive, filenames):
    """Create a BZIP2 archive with the bz2 Python module.

    Compresses the single file in *filenames* into *archive*, copying it in
    READ_SIZE_BYTES chunks.  More than one input file, or any failure while
    writing, raises ``util.PatoolError``.  Always returns None.
    """
    if len(filenames) > 1:
        raise util.PatoolError(
            'multi-file compression not supported in Python bz2')
    try:
        with bz2.BZ2File(archive, 'wb') as packed:
            source_name = filenames[0]
            with open(source_name, 'rb') as srcfile:
                while True:
                    block = srcfile.read(READ_SIZE_BYTES)
                    if not block:
                        break
                    packed.write(block)
    except Exception as err:
        raise util.PatoolError("error creating %s: %s" % (archive, err))
    return None
def compress(source, target, pagesize=4096):
    """Compress *source* page by page into *target* using bzip2.

    *target* starts with the header produced by
    ``util.create_header("bzip2", size)``; the pages returned by
    ``util.get_pages`` follow as one bzip2 stream.  A progress percentage
    is written to stdout every 100 pages and on the page that ends exactly
    at the file size.

    :param source: path of the file to compress
    :param target: path of the file to create
    :param pagesize: page size in bytes handed to ``util.get_pages``
    """
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    size = os.path.getsize(source)
    with open(target, "wb") as ftarget:
        ftarget.write(util.create_header("bzip2", size))
        with bz2file.BZ2File(filename=ftarget, mode="wb",
                             compresslevel=9) as packed:
            pages = util.get_pages(source, pagesize=pagesize)
            for index, page in enumerate(pages):
                is_last_page = (index + 1) * pagesize == size
                if index % 100 == 0 or is_last_page:
                    percent = float(index * pagesize) / size * 100
                    sys.stdout.write("\rProgress: {:.2f}%".format(percent))
                    sys.stdout.flush()
                packed.write(page)
    sys.stdout.write("\n")
    logging.debug("Done")
def check_bz2(file_path, check_content=True):
    """Tell whether *file_path* is a bz2 file and whether its content is OK.

    Returns a pair ``(is_bz2, is_valid)``:

    * ``(False, False)`` -- the file cannot be read or does not start with
      ``util.bz2_magic``;
    * ``(True, True)`` -- bz2 file, content not checked or not HTML;
    * ``(True, False)`` -- bz2 file whose first decompressed chunk is
      recognised by ``check_html``.
    """
    try:
        with open(file_path, "rb") as handle:
            has_magic = handle.read(3) == util.bz2_magic
    except Exception:
        return (False, False)
    if not has_magic:
        return (False, False)
    if not check_content:
        return (True, True)
    with bz2.BZ2File(file_path, mode='rb') as bzipped_file:
        chunk = bzipped_file.read(CHUNK_SIZE)
    # See if we have a compressed HTML file
    if check_html(chunk, file_path=False):
        return (True, False)
    return (True, True)
def handle_bz2(repository, uploaded_file_name):
    """Replace an uploaded bz2 file with its decompressed content.

    The data is decompressed in ``basic_util.CHUNK_SIZE`` chunks into a
    temporary file next to the upload, which is then moved over
    *uploaded_file_name*.  If decompression fails with IOError the problem
    is logged, the temporary file is removed and the upload is left
    untouched.

    The temporary descriptor and the bz2 handle are released on every path,
    and the temporary file is removed whenever the upload is not replaced.
    """
    fd, uncompressed = tempfile.mkstemp(
        prefix='repo_%d_upload_bunzip2_' % repository.id,
        dir=os.path.dirname(uploaded_file_name),
        text=False)
    completed = False
    try:
        with bz2.BZ2File(uploaded_file_name, 'rb') as bzipped_file:
            while True:
                try:
                    chunk = bzipped_file.read(basic_util.CHUNK_SIZE)
                except IOError:
                    log.exception('Problem uncompressing bz2 data "%s"',
                                  uploaded_file_name)
                    return
                if not chunk:
                    break
                os.write(fd, chunk)
        completed = True
    finally:
        os.close(fd)
        if not completed:
            os.remove(uncompressed)
    shutil.move(uncompressed, uploaded_file_name)
def open(name):
    """
    Intended to be private to the class...

    A flexible open routine that can handle plain text files or
    files compressed with gzip or bzip2.  Only used for the
    input files.  Output files are emitted uncompressed, until the
    tools in the next leg of the pipeline can work properly with
    compressed files.

    :param name: The filename to open.
    :return: A file object for the named file.
    """
    # This function shadows the builtin ``open``.  Going through ``io.open``
    # (the same function as the builtin on Python 3) guarantees the
    # plain-file branch can never call this function recursively.
    import io

    if name.endswith('.gz'):
        return gzip.open(name)
    if name.endswith('.bz2'):
        return bz2.BZ2File(name)
    return io.open(name)
def check_bz2(file_path, check_content=True):
    """Tell whether *file_path* is a bz2 file and whether its content is OK.

    Returns a pair ``(is_bz2, is_valid)``:

    * ``(False, False)`` -- the file cannot be read or does not start with
      ``util.bz2_magic``;
    * ``(True, True)`` -- bz2 file, content not checked or not HTML;
    * ``(True, False)`` -- bz2 file whose first decompressed 32 KiB are
      recognised by ``check_html``.
    """
    try:
        # Magic numbers are raw bytes: read them in binary mode (the 'U'
        # text mode no longer exists on current Python versions).
        with open(file_path, "rb") as temp:
            magic_check = temp.read(3)
        expected = util.bz2_magic
        if not isinstance(expected, bytes):
            # Tolerate a magic constant declared as text.
            expected = expected.encode("ascii")
        if magic_check != expected:
            return (False, False)
    except Exception:
        return (False, False)
    if not check_content:
        return (True, True)
    CHUNK_SIZE = 2**15  # 32 KiB
    with bz2.BZ2File(file_path, mode='rb') as bzipped_file:
        chunk = bzipped_file.read(CHUNK_SIZE)
    # See if we have a compressed HTML file
    if check_html(file_path, chunk=chunk):
        return (True, False)
    return (True, True)
def load_channels(sample, chr_list):
    """Load the per-chromosome channel data from bz2-compressed files.

    For every chromosome in *chr_list* the channels ``clipped_reads``,
    ``clipped_read_distance``, ``coverage`` and ``split_read_distance`` are
    read (``coverage`` as a NumPy array, the others as pickles).  The
    ``clipped_reads`` and ``split_read_distance`` entries are then unpacked
    into their component channels.

    NOTE: ``pickle.load`` executes code from the file; only load trusted
    data.

    :param sample: not used by the code in this function
    :param chr_list: chromosome names to load
    :return: ``defaultdict`` mapping chromosome -> channel name -> data
    """
    channel_names = [
        'clipped_reads', 'clipped_read_distance', 'coverage',
        'split_read_distance'
    ]
    if HPC_MODE:
        base_dir = "/hpc/cog_bioinf/ridder/users/smehrem/breakpoint-pairs/NA12878_channel_data/"
    else:
        base_dir = "/home/cog/smehrem/MinorResearchInternship/NA12878/"

    channel_data = defaultdict(dict)
    for chrom in chr_list:
        logging.info('Loading data for Chr%s' % chrom)
        for ch in channel_names:
            logging.info('Loading data for channel %s' % ch)
            is_numpy = ch == 'coverage'
            suffix = '.npy.bz2' if is_numpy else '.pbz2'
            filename = base_dir + ch + "/" + chrom + "_" + ch + suffix
            assert os.path.isfile(filename)

            logging.info('Reading %s for Chr%s' % (ch, chrom))
            with bz2file.BZ2File(filename, 'rb') as f:
                loader = np.load if is_numpy else pickle.load
                channel_data[chrom][ch] = loader(f)
            logging.info('End of reading')

        per_chrom = channel_data[chrom]

        # unpack clipped_reads
        (quality, clipped, inversion, duplication,
         translocation) = per_chrom['clipped_reads']
        per_chrom['read_quality'] = quality
        per_chrom['clipped_reads'] = clipped
        per_chrom['clipped_reads_inversion'] = inversion
        per_chrom['clipped_reads_duplication'] = duplication
        per_chrom['clipped_reads_translocation'] = translocation

        # unpack split_reads
        distance, split_reads = per_chrom['split_read_distance']
        per_chrom['split_read_distance'] = distance
        per_chrom['split_reads'] = split_reads

    return channel_data
def open_raw_data(filepath, binary=True):
    '''
    Open the input file which may be compressed.

    The opener is chosen from the file extension (case-insensitive):
    ``.sac`` and ``.zip`` are zip archives that must hold exactly one
    member, ``.bz2`` is bzip2, anything else is opened directly.  *binary*
    only affects files that are opened directly.

    :param filepath: Path of raw data file which can either be zip, bz2 or
        uncompressed.
    :type filepath: str
    :param binary: Open an uncompressed file in binary rather than text mode.
    :type binary: bool
    :returns: An opened file object.
    :rtype: file
    :raises IOError: If a zip archive does not contain exactly one file.
    '''
    extension = os.path.splitext(filepath)[1].lower()

    if extension in {'.sac', '.zip'}:
        zf = zipfile.ZipFile(filepath, 'r')
        filenames = zf.namelist()
        if len(filenames) != 1:
            # Nothing will be returned, so release the archive first.
            zf.close()
            raise IOError('Zip files must contain only a single data file.')
        return zf.open(filenames[0])

    if extension == '.bz2':
        return bz2.BZ2File(filepath, 'r')

    return open(filepath, 'rb' if binary else 'r')
def page_generator(tmp_dir, max_docs=None):
    """
    Generate cleaned wikipedia articles as a string.

    The corpus path comes from ``_maybe_download_corpus(tmp_dir)``.  A page
    starts at a ``<page>`` line and ends at the matching ``</page>`` line.
    For each page, ``_page_text`` extracts the text; when it returns
    something other than None, the text is parsed with ``mwparserfromhell``
    and yielded with its markup stripped.  Pages count towards *max_docs*
    whether or not they yield text.

    :param tmp_dir: directory handed to ``_maybe_download_corpus``
    :param max_docs: stop after this many pages (falsy means no limit)
    """
    corpus_filepath = _maybe_download_corpus(tmp_dir)
    lines = []  # joined once per page; avoids quadratic string growth
    count = 0
    # The with-block closes the corpus when the generator finishes, is
    # closed by the consumer, or is garbage-collected.
    with bz2file.BZ2File(corpus_filepath, "r", buffering=1000000) as corpus:
        for raw_line in corpus:
            line = raw_line.decode("utf-8")
            if not lines and line != u" <page>\n":
                continue
            lines.append(line)
            if line == u" </page>\n":
                doc_text = _page_text(u"".join(lines))
                if doc_text is not None:
                    parsed_text = mwparserfromhell.parse(doc_text) \
                        .strip_code(normalize=True, collapse=True)
                    yield parsed_text
                lines = []
                count += 1
                if max_docs and count >= max_docs:
                    break
def get_fileobj(filename, mode="r", compressed_formats=None):
    """
    Returns a fileobj. If the file is compressed, return an appropriate file
    reader. In text mode, always use 'utf-8' encoding.

    :param filename: path to file that should be opened
    :param mode: mode to pass to opener
    :param compressed_formats: list of allowed compressed file formats among
      'bz2', 'gzip' and 'zip'. If left to None, all 3 formats are allowed
    """
    if compressed_formats is None:
        compressed_formats = ['bz2', 'gzip', 'zip']
    # Remove 't' from mode, which may cause an error for compressed files
    mode = mode.replace('t', '')
    # the various compression readers don't support 'U' mode,
    # so we open in 'r'.
    cmode = 'r' if mode == 'U' else mode

    if 'gzip' in compressed_formats and is_gzip(filename):
        fh = gzip.GzipFile(filename, cmode)
    elif 'bz2' in compressed_formats and is_bz2(filename):
        fh = bz2.BZ2File(filename, cmode)
    elif 'zip' in compressed_formats and zipfile.is_zipfile(filename):
        # zipfile always works on bytes and rejects a 'b' in its mode
        # argument; text mode is emulated by the TextIOWrapper below.
        zmode = cmode.replace('b', '')
        # Return fileobj for the first file in a zip file.
        with zipfile.ZipFile(filename, zmode) as zh:
            fh = zh.open(zh.namelist()[0], zmode)
    elif 'b' in mode:
        return open(filename, mode)
    else:
        return io.open(filename, mode, encoding='utf-8')

    if 'b' not in mode:
        return io.TextIOWrapper(fh, encoding='utf-8')
    return fh
def _add_supported_pairs(oriented_pairs, final_pairs, label):
    """Add every sufficiently supported anchor/partner pair to *final_pairs*.

    *oriented_pairs* is a sequence of ``(anchor, partner)`` breakpoints.
    Candidates are grouped by the anchor's ``id()`` and, within an anchor,
    by the partner's chromosome and strand.  An anchor qualifies when at
    least ``min_support`` candidates share it; a partner group qualifies
    when it holds at least ``min_support`` positions.  The partner position
    kept is the maximum when the anchor strand is '+', else the minimum.

    *label* ('bp1' or 'bp2') only appears in the log messages.
    """
    # Nested defaultdict: seeing a new partner must not discard the
    # positions already collected for the anchor's other partners.
    partner_positions = defaultdict(lambda: defaultdict(list))
    anchor_counts = Counter()
    for anchor, partner in oriented_pairs:
        anchor_id = anchor.id()
        partner_id = '_'.join([partner.chr, partner.strand])
        partner_positions[anchor_id][partner_id].append(partner.pos)
        anchor_counts[anchor_id] += 1

    supported = [k for (k, v) in anchor_counts.items() if v >= min_support]
    logging.info('Min %d supported positions %s: %d/%d' %
                 (min_support, label, len(supported), len(anchor_counts)))

    for anchor_id in supported:
        # NOTE(review): assumes ids look like chr_pos_strand with no
        # additional '_' in the chromosome name -- confirm.
        anchor_chr, anchor_pos, anchor_strand = anchor_id.split('_')
        pick = max if anchor_strand == '+' else min
        for partner_id, positions in partner_positions[anchor_id].items():
            partner_chr, partner_strand = partner_id.split('_')
            if len(positions) >= min_support:
                final_pairs.add(
                    StructuralVariant(
                        Breakpoint(anchor_chr, int(anchor_pos),
                                   anchor_strand),
                        Breakpoint(partner_chr, int(pick(positions)),
                                   partner_strand)))

    logging.info('Length of pair set after %s perspective: %d' %
                 (label.upper(), len(final_pairs)))


def inspect_pairs(candidate_pairs, outFile):
    """Filter candidate breakpoint pairs by support and pickle the result.

    The candidates are examined twice: once anchored on their first
    breakpoint and once anchored on their second.  Pairs passing the
    ``min_support`` thresholds from either perspective are collected into
    one set, which is written to *outFile* as a bz2-compressed pickle.

    NOTE(review): pairs found from the second perspective are stored with
    the second breakpoint first, as in the original code -- confirm that
    this orientation is intended.

    :param candidate_pairs: iterable of objects whose ``tuple`` attribute
        unpacks to two breakpoints
    :param outFile: path of the output file
    """
    final_pairs = set()

    # from bp1 point of view
    forward = [tuple(sv.tuple) for sv in candidate_pairs]
    _add_supported_pairs(forward, final_pairs, 'bp1')

    # from bp2 point of view
    backward = [(bp2, bp1) for (bp1, bp2) in forward]
    _add_supported_pairs(backward, final_pairs, 'bp2')

    # Write the output in pickle format
    with bz2file.BZ2File(outFile, 'wb') as f:
        pickle.dump(final_pairs, f)
def compress(source, target, reference, nointra, delta, inner, pagesize=4096):
    """Compress *source* against the *reference* dump and write *target*.

    Every page of *source* is classified against the reference page at the
    same index:

    * identical page at the same index -- only counted;
    * page that occurs elsewhere in the reference -- recorded in ``dedups``;
    * otherwise, when *delta* is not None and ``util.create_diff`` returns
      a diff -- recorded in ``diffs``;
    * otherwise -- recorded as a new page.

    The resulting tables are serialized into a temporary file under
    ``.tmp``, which is then copied into *target* after a header, optionally
    through an inner compressor.

    :param source: path of the dump to compress
    :param target: path of the file to create
    :param reference: path of the reference dump
    :param nointra: if true, new pages are stored one per occurrence;
        otherwise occurrences are grouped by page content
    :param delta: diffing is attempted only when this is not None
    :param inner: inner compressor: None, "gzip", "bzip2" or "7zip"
    :param pagesize: only logged by the code in this function
    :return: always 0
    """
    # some info
    logging.debug("Starting compression of %s to %s", repr(source),
                  repr(target))
    logging.debug("Page size: %d", pagesize)
    logging.debug("Reference dump: %s", reference)
    # pages + page numbers bookkeeping
    reference_pages, reference_pagenrs = [], {}
    for i, page in enumerate(util.get_pages(reference)):
        reference_pages.append(page)
        # Remember only the first index at which each distinct page occurs.
        if page not in reference_pagenrs:
            reference_pagenrs[page] = i
    reference_pages_set = set(reference_pages)
    # find new + duplicatable pages
    dedups = dd(list)
    diffs = dd()  # presumably dd is collections.defaultdict -- confirm
    diff_seen = set()
    if nointra:
        new_pagenrs = []
    else:
        new_pagenrs = dd(list)
    new_pages = []
    same_distinct, same_total = set(), 0
    source_pages = []
    for i, page in enumerate(util.get_pages(source)):
        source_pages.append(page)
        # NOTE(review): this indexing raises IndexError when source has more
        # pages than the reference.
        if reference_pages[i] != page:
            if page not in reference_pages_set:
                if delta is not None:
                    d = util.create_diff(reference_pages[i], page)
                    if d is not None:
                        diff_seen.add(page)
                        diffs[i] = d
                        continue
                # No usable diff: store the page itself.
                if nointra:
                    new_pagenrs.append(i)
                else:
                    new_pagenrs[page].append(i)
                new_pages.append(page)
            else:
                dedups[page].append(i)
        else:
            same_total += 1
            same_distinct.add(page)
    source_pages_set = set(source_pages)
    newpagescnt = len(new_pages), len(set(new_pages))
    # intervalize
    if nointra:
        new_pagenrs = util.intervalize(new_pagenrs)
    else:
        new_pagenrs = {
            page: util.intervalize(new_pagenrs[page])
            for page in new_pagenrs
        }
    dedups = {page: util.intervalize(dedups[page]) for page in dedups}
    # write file
    util.create_dir(".tmp")
    tmphandle, tmpfile = tempfile.mkstemp(dir=".tmp")
    try:
        with open(tmpfile, "wb") as ftmp:
            # Reference name, terminated by a NUL byte.
            ftmp.write(reference + "\x00")
            # Deduplicated pages, each once, in reference order.
            inorder = []
            seen = set()
            for page in reference_pages:
                if page in dedups and page not in seen:
                    inorder.append(page)
                    seen.add(page)
            util.create_pagenr_list(
                [reference_pagenrs[page] for page in inorder], ftmp)
            for page in inorder:
                ftmp.write(util.create_interval_list(dedups[page]))
            if delta is not None:
                # Diffed page numbers, then the diffs in the same order.
                util.create_pagenr_list(sorted(diffs), ftmp)
                for pagenr in sorted(diffs):
                    ftmp.write(diffs[pagenr])
            if nointra:
                ftmp.write(util.create_interval_list(new_pagenrs))
                for page in new_pages:
                    ftmp.write(page)
            else:
                # Count of distinct new pages, their interval lists, then
                # the pages themselves in the same iteration order.
                ftmp.write(struct.pack("<I", len(new_pagenrs)))
                for page in new_pagenrs:
                    ftmp.write(util.create_interval_list(new_pagenrs[page]))
                for page in new_pagenrs:
                    ftmp.write(page)
        with open(tmpfile, "rb") as ftmp, open(target, "wb") as ftarget:
            ftarget.write(
                util.create_header(create_method_name(nointra, delta, inner),
                                   os.path.getsize(source)))
            # Flush the header before anything else writes to the file.
            ftarget.flush()
            if inner is None:
                shutil.copyfileobj(ftmp, ftarget)
            elif inner == "gzip":
                with gzip.GzipFile(fileobj=ftarget, mode="wb",
                                   compresslevel=9) as ftarget:
                    shutil.copyfileobj(ftmp, ftarget)
            elif inner == "bzip2":
                with bz2file.BZ2File(filename=ftarget, mode="wb",
                                     compresslevel=9) as ftarget:
                    shutil.copyfileobj(ftmp, ftarget)
            elif inner == "7zip":
                # 7za reads the temporary file on stdin and writes straight
                # into the target file on stdout.
                p = subprocess.Popen(
                    ["7za", "a", "-an", "-txz", "-mx=9", "-si", "-so", source],
                    stdin=ftmp,
                    stdout=ftarget,
                    stderr=subprocess.PIPE)
                p.communicate()
    finally:
        os.close(tmphandle)
        os.remove(tmpfile)
    # some info
    dedup_distinct = len(set(dedups.keys()) | same_distinct)
    dedup_total = same_total + sum(b - a + 1 for l in dedups.values()
                                   for a, b in l)
    logging.debug("Deduplicated pages at the same offset: %d/%d (%d/%d)",
                  same_total, len(source_pages), len(same_distinct),
                  len(source_pages_set))
    logging.debug("Deduplicated pages at different offsets: %d/%d (%d/%d)",
                  dedup_total - same_total, len(source_pages), len(dedups),
                  len(source_pages_set))
    logging.debug("Deduplicated pages in total: %d/%d (%d/%d)", dedup_total,
                  len(source_pages), dedup_distinct, len(source_pages_set))
    if delta is not None:
        logging.debug("Diffed pages: %d/%d (%d/%d)", len(diffs),
                      len(source_pages), len(diff_seen),
                      len(source_pages_set))
    logging.debug("New pages: %d/%d (%d/%d)", newpagescnt[0],
                  len(source_pages), newpagescnt[1], len(source_pages_set))
    logging.debug("Done")
    return 0