import psycopg2
import requests

# AnimatedProgressBar, number_of_queries, build_queries and
# handle_json_response are project-local helpers.


def scrape(parallelity=4, postgres_conn_string="dbname=themenstadtplan"):
    GRANULARITY = 10.
    XMIN = 5400499.
    XMAX = 5427620.
    YMIN = 5649580.
    YMAX = 5672231.
    # smaller test area
    # XMIN = 5412133.
    # XMAX = 5415148.
    # YMIN = 5656993.
    # YMAX = 5657370.
    BASEURL = "http://stadtplan2.dresden.de/(S(jcksai1ssxvzjbveqyv1aizy))/ajaxpro/ASP.spdd_controls_mapcontainer_ascx,App_Web_o3sebx4k.ashx"
    HEADERS = {
        "X-AjaxPro-Method": "GetMapTipEx",
        "Origin": "http://stadtplan2.dresden.de",
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "de-DE",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Content-Type": "text/plain; charset=UTF-8",
        "Accept": "*/*",
        "Referer": "http://stadtplan2.dresden.de/(S(jcksai1ssxvzjbveqyv1aizy))/spdd.aspx",
        "Cookie": "cardo3SessionGuid=C3_4ef68a3a-59ca-4a0c-b3cc-722fdabb1c84",
        "Connection": "keep-alive",
    }
    BASEREQ = '{"applicationContextType":null,"themeList":"6.1121|278.1397_6.2/278.1397|318.105_6.2/318.105|320.1580_6.2/320.1580|6.1286|6.1388|6.1478|6.640|6.1444|6.1006|6.641|6.613|6.642|6.1194|6.1257|6.343|6.854|6.22|6.130|6.103|6.128|6.377|6.379|6.565|6.105|6.108|6.109|6.117|6.106|6.107|6.177|6.178|6.994|6.118|6.119|6.1580|6.1582|6.376|6.431|6.432|6.1325|6.1321|6.1322|6.1323|6.777|6.783|6.778|6.779|6.775|6.782|6.784|6.785|6.769|6.770|6.771|6.773|6.772|6.774|6.780|6.781|6.910|6.776|6.768|6.363|6.429|6.498|6.426|6.1011|6.1523|6.511|6.542|6.144|6.183|6.187|6.189|6.360|6.357|6.592|6.359|6.358|6.356|6.444|6.477|6.483|6.470|6.471|6.478|6.472|6.479|6.473|6.480|6.474|6.481|6.484|6.475|6.476|6.482|6.1483|6.1048|6.147|6.146|6.1214|6.196|6.967|6.1407|6.1403|6.1418|6.1318|6.142|6.437|6.441|6.442|6.434|6.435|6.443|6.436|6.539|6.529|6.531|6.532|6.533|6.534|6.535|6.536|6.537|6.622|6.1414|6.1413|6.1195|6.597|6.594|6.596|6.599|6.595|6.598|6.1310|6.1320|6.1324|6.447|6.445|6.1294|6.1295|6.191|6.192|6.446|6.1293|6.193|6.194|6.450|6.451|6.452|6.448|6.449|6.430|6.993|6.566|6.197|6.831|6.807|6.578|6.638|6.1436|6.1440|6.614|6.1359|6.1445|6.1449|6.954|6.1080|6.839|6.1408|6.1395|6.1312|6.127|6.1524|6.1552|6.1049|6.1326|279.159_6.2/279.159|6.992|6.524|6.525|6.123|6.121|6.122|6.1046|6.1397|6.1554|6.1555|6.1047|6.795|6.796|6.767|6.871|6.870|6.935|6.701|6.341|6.152|6.1369|6.153|6.148|6.1562|6.42|6.1505|6.975|6.1461|6.1460|6.1459|6.1489|6.1482|6.846|6.886|6.887|6.888|6.889|6.885|6.902|6.900|6.901|6.949|6.950|6.951|6.286|6.289|6.295|6.296|6.1231|6.1232|6.1368|6.1547|6.590|6.805|6.1251|6.1075|6.1073|6.586|6.1591|6.1217|6.1216|6.1225|6.422|6.407|6.408|6.200|6.203|6.204|6.409|6.410|6.411|6.205|6.206|6.207|6.208|6.209|6.210|6.316|6.847|6.1307|6.486|6.601|6.463|6.233|6.236|6.234|6.237|6.235|6.159|6.232|6.283|6.47|6.1129|6.861|6.1192|6.545|6.325|6.1556|6.372|6.371|6.1052|6.369|6.370|6.983|6.1317|6.1338|6.1468|6.427|6.648|6.647|6.649|6.653|6.654|6.652|6.651|6.876|6.877|6.176|6.125|$BaseMap#5#Tiles","gx":%.6f,"gy":%.6f,"srs":31469,"currentMapScale":1000}'
    progressbar = AnimatedProgressBar(
        end=number_of_queries(XMIN, XMAX, YMIN, YMAX, GRANULARITY), width=80)
    with psycopg2.connect(postgres_conn_string) as pg_conn:
        pg_conn.autocommit = True
        with pg_conn.cursor() as pg_cursor:
            for query in build_queries(XMIN, XMAX, YMIN, YMAX,
                                       basereq=BASEREQ, granularity=GRANULARITY):
                res = requests.post(BASEURL, data=query, headers=HEADERS)
                handle_json_response(res.json(), pg_cursor)
                progressbar + 1
                progressbar.show_progress()
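# The two grid helpers called by scrape() are not part of this listing. Below
# is a minimal sketch of what they plausibly look like, inferred from the
# "gx"/"gy" %.6f placeholders in BASEREQ and from the call sites above. The
# names match the calls, but the bodies are assumptions, not the original
# implementation:


def number_of_queries(xmin, xmax, ymin, ymax, granularity):
    # Grid points sampled along each axis, multiplied together; must agree
    # with build_queries() so the progress bar ends at 100 %.
    nx = int((xmax - xmin) // granularity) + 1
    ny = int((ymax - ymin) // granularity) + 1
    return nx * ny


def build_queries(xmin, xmax, ymin, ymax, basereq, granularity):
    # Yield one request body per grid point; basereq carries two %.6f
    # placeholders for the Gauss-Krueger coordinates gx and gy.
    y = ymin
    while y <= ymax:
        x = xmin
        while x <= xmax:
            yield basereq % (x, y)
            x += granularity
        y += granularity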
import time


def testtwo():
    p = AnimatedProgressBar(end=100, width=80)
    while True:
        p + 5
        p.show_progress()
        time.sleep(0.1)
        if p.progress == 100:
            break
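# AnimatedProgressBar itself is project-local and not shown in this listing.
# From the call sites in these examples (end/width/stdout keywords, `bar + n`,
# show_progress, process, set_percent, finish, done, .progress) one can infer
# its interface; the class below is a minimal compatible sketch, an assumption
# rather than the original code:

import sys


class AnimatedProgressBar(object):
    def __init__(self, end=100, width=80, stdout=sys.stdout):
        self.end = float(end)   # value that counts as 100 %
        self.width = width      # bar width in characters
        self.progress = 0.0     # current position, 0..end
        self.stdout = stdout

    def __add__(self, amount):
        # `bar + n` advances the bar, as used throughout these examples.
        self.progress = min(self.progress + amount, self.end)
        return self

    def process(self, amount):
        # Relative advance by `amount` units.
        self.__add__(amount)

    def set_percent(self, percent):
        # Absolute position as a percentage of `end`.
        self.progress = self.end * percent / 100.0

    def show_progress(self):
        filled = int(self.width * self.progress / self.end)
        bar = "[" + "#" * filled + " " * (self.width - filled) + "]"
        self.stdout.write("\r" + bar)
        self.stdout.flush()

    def finish(self):
        self.set_percent(100)
        self.show_progress()
        self.stdout.write("\n")

    done = finish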
import os
import struct
from hashlib import sha256

# AnimatedProgressBar and DiskError are project-local.


def hashing(disk_path, meta_path, chunk_size=4096, window_size=512, print_out=None):
    # TODO: need more efficient implementation, e.g. bisect
    # Generate hash of base disk.
    #   disk_path   : raw disk path
    #   chunk_size  : hash chunk size
    #   window_size : slicing window size
    #   print_out   : progress bar
    if print_out:
        print_out.write("[INFO] Start VM Disk hashing\n")
        prog_bar = AnimatedProgressBar(end=100, width=80, stdout=print_out)
    total_iteration = os.path.getsize(disk_path) // window_size
    iter_count = 0
    prog_interval = 100
    disk_file = open(disk_path, "rb")
    out_file = open(meta_path, "w+b")
    data = disk_file.read(chunk_size)
    if (not data) or len(data) < chunk_size:
        raise DiskError("invalid raw disk size")
    entire_hashing = sha256()
    entire_hashing.update(data)
    s_offset = 0
    data_len = len(data)
    hash_dic = dict()
    while True:
        if print_out:
            if iter_count % prog_interval == 0:
                prog_bar.process(100.0 * prog_interval / total_iteration)
                prog_bar.show_progress()
            iter_count += 1
        hashed_data = sha256(data).digest()
        if hash_dic.get(hashed_data) is None:
            hash_dic[hashed_data] = (hashed_data, s_offset, data_len)
        added_data = disk_file.read(window_size)
        if (not added_data) or len(added_data) != window_size:
            print("")
            break
        s_offset += window_size
        data = data[window_size:] + added_data
        entire_hashing.update(added_data)
    for hashed_data, s_offset, data_len in list(hash_dic.values()):
        out_file.write(struct.pack("!QI%ds" % len(hashed_data),
                                   s_offset, data_len, hashed_data))
    disk_file.close()
    out_file.close()
    return entire_hashing.hexdigest()
import os
import struct
import sys
from hashlib import sha256

# Variant of the same function that always reports progress on sys.stdout.


def hashing(disk_path, meta_path, chunk_size=4096, window_size=512):
    # TODO: need more efficient implementation, e.g. bisect
    # Generate hash of base disk.
    #   disk_path   : raw disk path
    #   chunk_size  : hash chunk size
    #   window_size : slicing window size
    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)
    total_iteration = os.path.getsize(disk_path) // window_size
    iter_count = 0
    prog_interval = 100
    disk_file = open(disk_path, "rb")
    out_file = open(meta_path, "w+b")
    data = disk_file.read(chunk_size)
    if (not data) or len(data) < chunk_size:
        raise DiskError("invalid raw disk size")
    entire_hashing = sha256()
    entire_hashing.update(data)
    s_offset = 0
    data_len = len(data)
    hash_dic = dict()
    while True:
        if iter_count % prog_interval == 0:
            prog_bar.process(100.0 * prog_interval / total_iteration)
            prog_bar.show_progress()
        iter_count += 1
        hashed_data = sha256(data).digest()
        if hash_dic.get(hashed_data) is None:
            hash_dic[hashed_data] = (hashed_data, s_offset, data_len)
        added_data = disk_file.read(window_size)
        if (not added_data) or len(added_data) != window_size:
            break
        s_offset += window_size
        data = data[window_size:] + added_data
        entire_hashing.update(added_data)
    for hashed_data, s_offset, data_len in list(hash_dic.values()):
        out_file.write(struct.pack("!QI%ds" % len(hashed_data),
                                   s_offset, data_len, hashed_data))
    disk_file.close()
    out_file.close()
    return entire_hashing.hexdigest()
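# hashing() serializes each unique window as a fixed-size record: an 8-byte
# big-endian offset (Q), a 4-byte length (I), then the 32-byte SHA-256 digest.
# A minimal sketch of reading the metadata file back; the helper name is ours,
# not part of the original code:

import struct


def read_hash_meta(meta_path):
    records = []
    record = struct.Struct("!QI32s")  # offset, length, sha256 digest
    with open(meta_path, "rb") as f:
        while True:
            buf = f.read(record.size)
            if len(buf) < record.size:
                break
            records.append(record.unpack(buf))
    return records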
import hashlib
import os

# self._getFileCount and self.chunkReader are methods of the surrounding class.


def checkForDuplicates(self, path, duplicateCollector, hash=hashlib.sha1):
    hashes = {}
    progressBar = AnimatedProgressBar(end=self._getFileCount(path))
    progressBar.show_progress()
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            progressBar + 1
            fullPath = os.path.join(dirpath, filename)
            hashObj = hash()
            for chunk in self.chunkReader(open(fullPath, 'rb')):
                hashObj.update(chunk)
            # Identify files by (digest, size) to make hash collisions harmless.
            fileID = (hashObj.digest(), os.path.getsize(fullPath))
            duplicate = hashes.get(fileID, None)
            if duplicate:
                duplicateCollector.add(fullPath, duplicate)
                # print('Duplicate found: %s and %s' % (fullPath, duplicate))
            else:
                hashes[fileID] = fullPath
            progressBar.show_progress()
    progressBar.done()
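# A sketch of driving checkForDuplicates(). The DuplicateCollector below is a
# stand-in for whatever collector class the project actually uses; only its
# add() method is implied by the code above, everything else is an assumption:


class DuplicateCollector(object):
    def __init__(self):
        self.pairs = []

    def add(self, duplicate, original):
        # Record one (duplicate, original) file pair.
        self.pairs.append((duplicate, original))


# Hypothetical usage, assuming a class that owns checkForDuplicates():
#
#   finder = FileDuplicateFinder()
#   collector = DuplicateCollector()
#   finder.checkForDuplicates("/some/directory", collector)
#   for dup, orig in collector.pairs:
#       print("%s duplicates %s" % (dup, orig))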
import os
from hashlib import sha256

# AnimatedProgressBar, Memory, DeltaItem and tool are project-local.


def _get_mem_hash(self, fin, start_offset, end_offset, hash_list, **kwargs):
    # kwargs
    #   diff: compare hash_list with self object
    #   print_out: log/process output
    #   free_pfn_dict: free memory physical frame numbers as a dictionary {'#': 1, ...}
    diff = kwargs.get("diff", None)
    apply_free_memory = kwargs.get("apply_free_memory", True)
    free_pfn_dict = kwargs.get("free_pfn_dict", None)
    print_out = kwargs.get("print_out", open(os.devnull, "w"))
    print_out.write("[INFO] Get hash list of memory page\n")
    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=print_out)

    fin.seek(start_offset)
    total_size = end_offset - start_offset
    ram_offset = 0
    freed_page_counter = 0
    while total_size != ram_offset:
        data = fin.read(Memory.RAM_PAGE_SIZE)
        if not diff:
            hash_list.append((ram_offset, len(data), sha256(data).digest()))
        else:
            # Compare input with the hash of the corresponding base memory
            # page; save only when it is different.
            self_hash_value = self.hash_list[ram_offset // Memory.RAM_PAGE_SIZE][2]
            if self_hash_value != sha256(data).digest():
                is_free_memory = False
                if (free_pfn_dict is not None) and \
                        (free_pfn_dict.get(ram_offset // Memory.RAM_PAGE_SIZE, None) == 1):
                    is_free_memory = True
                if is_free_memory and apply_free_memory:
                    # Do not compare; it is free memory.
                    freed_page_counter += 1
                else:
                    # Get xdelta comparing against self.raw; save the xdelta
                    # as a DeltaItem only when it is smaller than the page.
                    source_data = self.get_raw_data(ram_offset, len(data))
                    try:
                        patch = tool.diff_data(source_data, data, 2 * len(source_data))
                        if len(patch) < len(data):
                            delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                                   ram_offset, len(data),
                                                   hash_value=sha256(data).digest(),
                                                   ref_id=DeltaItem.REF_XDELTA,
                                                   data_len=len(patch),
                                                   data=patch)
                        else:
                            raise IOError("xdelta3 patch is bigger than original")
                    except IOError as e:
                        # print_out.write("[INFO] xdelta failed, so save it as raw (%s)\n" % str(e))
                        delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                               ram_offset, len(data),
                                               hash_value=sha256(data).digest(),
                                               ref_id=DeltaItem.REF_RAW,
                                               data_len=len(data),
                                               data=data)
                    hash_list.append(delta_item)
                # memory over-usage protection (400 MB for the hash list)
                if len(hash_list) > Memory.RAM_PAGE_SIZE * 1000000:
                    raise MemoryError("possibly comparing with wrong base VM")
        ram_offset += len(data)
        # update the progress bar every 100 pages
        if (ram_offset % (Memory.RAM_PAGE_SIZE * 100)) == 0:
            prog_bar.set_percent(100.0 * ram_offset / total_size)
            prog_bar.show_progress()
    prog_bar.finish()
    return freed_page_counter
# FILESIZE and CHUNK are module-level constants of the example script.


def testone():
    progress = AnimatedProgressBar(end=FILESIZE, width=50)
    for i in range(0, FILESIZE, CHUNK):
        progress + CHUNK
        progress.show_progress()
import sys
from hashlib import sha256

# AnimatedProgressBar, Memory, DeltaItem, tool and LOG are project-local.
# This variant drops start_offset and guards against a launch snapshot that
# is larger than the base VM's hash list.


def _get_mem_hash(self, fin, end_offset, hash_list, **kwargs):
    # kwargs
    #   diff: compare hash_list with self object
    #   free_pfn_dict: free memory physical frame numbers as a dictionary {'#': 1, ...}
    diff = kwargs.get("diff", None)
    apply_free_memory = kwargs.get("apply_free_memory", True)
    free_pfn_dict = kwargs.get("free_pfn_dict", None)
    LOG.info("Get hash list of memory page")
    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)

    total_size = end_offset
    ram_offset = 0
    freed_page_counter = 0
    base_hashlist_length = len(self.hash_list)
    while total_size != ram_offset:
        data = fin.read(Memory.RAM_PAGE_SIZE)
        if not diff:
            hash_list.append((ram_offset, len(data), sha256(data).digest()))
        else:
            # Compare input with the hash of the corresponding base memory
            # page; save only when it is different.
            hash_list_index = ram_offset // Memory.RAM_PAGE_SIZE
            if hash_list_index < base_hashlist_length:
                self_hash_value = self.hash_list[hash_list_index][2]
            else:
                self_hash_value = None
            if self_hash_value != sha256(data).digest():
                is_free_memory = False
                if (free_pfn_dict is not None) and \
                        (free_pfn_dict.get(ram_offset // Memory.RAM_PAGE_SIZE, None) == 1):
                    is_free_memory = True
                if is_free_memory and apply_free_memory:
                    # Do not compare; it is free memory.
                    freed_page_counter += 1
                else:
                    # Get xdelta comparing against self.raw; save the xdelta
                    # as a DeltaItem only when it is smaller than the page.
                    source_data = self.get_raw_data(ram_offset, len(data))
                    try:
                        if source_data is None:
                            raise IOError("launch memory snapshot is bigger than base vm")
                        patch = tool.diff_data(source_data, data, 2 * len(source_data))
                        if len(patch) < len(data):
                            delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                                   ram_offset, len(data),
                                                   hash_value=sha256(data).digest(),
                                                   ref_id=DeltaItem.REF_XDELTA,
                                                   data_len=len(patch),
                                                   data=patch)
                        else:
                            raise IOError("xdelta3 patch is bigger than original")
                    except IOError as e:
                        # LOG.info("xdelta failed, so save it as raw (%s)" % str(e))
                        delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                               ram_offset, len(data),
                                               hash_value=sha256(data).digest(),
                                               ref_id=DeltaItem.REF_RAW,
                                               data_len=len(data),
                                               data=data)
                    hash_list.append(delta_item)
                # memory over-usage protection (400 MB for the hash list)
                if len(hash_list) > Memory.RAM_PAGE_SIZE * 1000000:
                    raise MemoryError("possibly comparing with wrong base VM")
        ram_offset += len(data)
        # update the progress bar every 100 pages
        if (ram_offset % (Memory.RAM_PAGE_SIZE * 100)) == 0:
            prog_bar.set_percent(100.0 * ram_offset / total_size)
            prog_bar.show_progress()
    prog_bar.finish()
    return freed_page_counter
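# The core pattern of both _get_mem_hash variants, stripped of the xdelta and
# free-page handling: hash a snapshot page by page and keep only the pages
# whose hash differs from the base. The function name, file arguments and the
# page size below are illustrative assumptions:

from hashlib import sha256

PAGE_SIZE = 4096


def changed_pages(base_path, snapshot_path):
    changed = []
    with open(base_path, "rb") as base, open(snapshot_path, "rb") as snap:
        offset = 0
        while True:
            page = snap.read(PAGE_SIZE)
            if not page:
                break
            base_page = base.read(PAGE_SIZE)
            # A page counts as changed if the base is shorter or hashes differ.
            if sha256(page).digest() != sha256(base_page).digest():
                changed.append((offset, page))
            offset += len(page)
    return changed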