def main():
    parser = argparse.ArgumentParser(
        'Update the icomoon icon font from the provided archive')
    parser.add_argument('archive', help='Path to .zip file generated by icomoon')
    args = parser.parse_args()

    script_dir = os.path.dirname(os.path.abspath(__file__))
    vendor_style_dir = script_dir + '/../h/static/styles/vendor'

    icon_font_archive = ZipFile(args.archive)
    icon_font_archive.extract('selection.json', vendor_style_dir + '/fonts')
    icon_font_archive.extract('fonts/h.woff', vendor_style_dir)

    css_input_file = icon_font_archive.open('style.css')
    css_output_file = open(vendor_style_dir + '/icomoon.css', 'w')

    for line in css_input_file:
        if "format('woff')" in line:
            # inline the WOFF format file
            woff_content = icon_font_archive.open('fonts/h.woff').read()
            woff_src_line = """
/* WARNING - the URL below is inlined
 * because the CSS asset pipeline is not correctly rebasing
 * URLs when concatenating files together.
 *
 * See issue #2571
 */
src:url('data:application/font-woff;base64,%s') format('woff');
"""
            css_output_file.write(woff_src_line % b64encode(woff_content))
        elif "url(" in line:
            # skip non-WOFF format fonts
            pass
        else:
            css_output_file.write(line)
def parse(self, file):
    epub = ZipFile(file)
    if epub.read('mimetype') != 'application/epub+zip'.encode('ascii'):
        raise BadEPub
    with epub.open('META-INF/container.xml') as container_file:
        container = etree.parse(container_file).getroot()
    rootfiles = container.find('./cnt:rootfiles', NS_MAP)
    for rootfile in rootfiles.findall('./cnt:rootfile', NS_MAP):
        if rootfile.get('media-type') != 'application/oebps-package+xml':
            raise BadEPub
        content_path = rootfile.get('full-path')
        break  # only try the first rootfile
    content_dir = os.path.dirname(content_path)
    flowables = []
    with epub.open(content_path) as content_file:
        package = etree.parse(content_file).getroot()
    metadata = package.find('./opf:metadata', NS_MAP)
    print(metadata.find('./dc:title', NS_MAP).text)
    print(metadata.find('./dc:creator', NS_MAP).text)
    manifest = package.find('./opf:manifest', NS_MAP)
    items = {item.get('id'): item
             for item in manifest.findall('./opf:item', NS_MAP)}
    spine = package.find('./opf:spine', NS_MAP)
    for itemref in spine.findall('./opf:itemref', NS_MAP):
        item = items[itemref.get('idref')]
        filename = os.path.join(content_dir, item.get('href'))
        if filename.endswith('pt04.html'):
            break
        print(filename)
        with epub.open(filename) as xhtml_file:
            xhtml_parser = elementtree.Parser(CustomElement)
            xhtml_tree = xhtml_parser.parse(xhtml_file)
            for flowable in self.from_doctree(xhtml_tree.getroot()):
                flowables.append(flowable)
    return flowables
def populate(show):
    if show in populated:
        return
    else:
        populated.append(show)
    showId = getTvdbId(show)
    endpoint = 'http://thetvdb.com/api/{apikey}/series/{showId}/all/en.zip'.format(
        apikey=api_key, showId=showId)
    r = requests.get(endpoint)
    z = ZipFile(BytesIO(r.content))
    dataFile = z.open('en.xml')
    data = BeautifulSoup(dataFile.read(), 'lxml')
    dataFile.close()
    bannerFile = z.open('banners.xml')
    banners = BeautifulSoup(bannerFile.read(), 'lxml')
    bannerFile.close()
    box_url = ''
    for b in banners.find_all('banner'):
        if b.bannertype.string == 'fanart':
            box_url = 'http://thetvdb.com/banners/' + b.bannerpath.string
            break
    localCache[show]['box_url'] = str(box_url)
    seasons = set(int(s.string) for s in data.find_all('seasonnumber')
                  if s.string != '0')
    for season in seasons:
        localCache[show]['seasons'][season] = {'episodes': {}}
        episodes = [e for e in data.find_all('episode')
                    if int(e.seasonnumber.string) == season]
        for episode in episodes:
            number = int(episode.episodenumber.string)
            epName = str(episode.episodename.string)
            epThumb = 'http://thetvdb.com/banners/' + str(episode.filename.string)
            localCache[show]['seasons'][season]['episodes'][number] = {
                'name': epName,
                'thumb_url': epThumb,
            }
    z.close()
def handle_label(self, label, **options):
    zip = ZipFile(label)
    map = {}
    map['users'] = self.import_users(zip.open("Users.xml"))
    map['questions'], map['answers'] = self.import_posts(zip.open("Posts.xml"))
def _read(self):
    global MANIFEST_FNAME

    z = ZipFile(self.fname, 'r', compression=self.compression)

    def _err(msg):
        z.close()
        raise GluttonImportantFileNotFoundError(msg)

    # without the manifest all is lost
    # we need this to get the names of the other XML files
    if MANIFEST_FNAME not in z.namelist():
        _err('manifest not found in %s' % self.fname)

    self.metadata = json.load(z.open(MANIFEST_FNAME))

    self.log.info("read manifest - created on %s using glutton version %.1f" %
                  (time.strftime('%d/%m/%y at %H:%M:%S',
                                 time.localtime(self.download_time)),
                   self.version))

    # the data file is the raw data grouped into gene families
    # when we do a local alignment we need to get the gene id
    # of the best hit and find out which gene family it belongs to
    if self.metadata['data-file'] not in z.namelist():
        _err('data file (%s) not found in %s' %
             (self.metadata['data-file'], self.fname))

    self.data = json_to_glutton(json.load(z.open(self.metadata['data-file'])))
    self.seq2famid = self._create_lookup_table(self.data)

    self.log.info("read %d gene families (%d genes)" %
                  (len(self.data), len(self.seq2famid)))

    z.close()
def unzip(filename):
    z = ZipFile(filename)
    names = z.namelist()
    for path in names:
        if path.startswith('__MACOSX/'):
            continue
        base, name = os.path.split(path)
        if name.startswith('._') and \
                '%s/' % name.replace('._', '', 1) in names:
            continue
        double = os.path.join('__MACOSX', base, '._' + name)
        if double in names:
            print '=> %s.bin' % path
            info = z.getinfo(path)
            bin = MacBinary(name)
            bin.data = z.open(path, 'r').read()
            bin.res = z.open(double, 'r').read()
            modified = datetime.datetime(*info.date_time)
            bin.modified = time.mktime(modified.timetuple())
            bin.created = time.time()
            if not os.path.exists(base):
                os.makedirs(base)
            with open('%s.bin' % path.rstrip('\r'), 'wb') as f:
                f.write(bin.encode())
        else:
            print '-> %s' % path
            z.extract(path)
def read_jar(jar):
    zf = symname = version = lastmod = cmtid = cmttime = origin = cmtdesc = branch = None
    try:
        zf = ZipFile(jar)
        manifest_f = None
        try:
            manifest_f = zf.open('META-INF/MANIFEST.MF')
            manifest = manifest_f.read()
            symname = re_search(r'Bundle-SymbolicName: (.*?)(?:\s|;)', manifest)
            version = re_search(r'Bundle-Version: (.*?)(?:\s|;)', manifest)
            lastmod = re_search(r'Bnd-LastModified: (.*?)(?:\s|;)', manifest)
        except:
            traceback.print_exc()
        finally:
            if manifest_f:
                manifest_f.close()
        gitprops_f = None
        try:
            gitprops_f = zf.open('git.properties')
            gitprops = gitprops_f.read()
            cmtid = re_search(r'git.commit.id.abbrev=(.*?)\n', gitprops)
            cmttime = re_search(r'git.commit.time=(.*?)\n', gitprops)
            origin = re_search(r'git.remote.origin.url=(.*?)\n', gitprops)
            cmtdesc = re_search(r'git.commit.id.describe=(.*?)\n', gitprops)
            branch = re_search(r'git.branch=(.*?)\n', gitprops)
        except KeyError:
            pass
        except:
            traceback.print_exc()
        finally:
            if gitprops_f:
                gitprops_f.close()
    finally:
        if zf:
            zf.close()
    return (symname, version, lastmod, cmtid, cmttime, origin, cmtdesc, branch)
def handle_noargs(self, **options):
    is_verbose = options['verbosity'] > 0
    if is_verbose:
        print "Syncing into", self._static_root
        print "Getting zip from ", self.ZIP_URL
    zip_url = urllib2.urlopen(self.ZIP_URL)
    zip_file = ZipFile(StringIO(zip_url.read()))
    for member in zip_file.namelist():
        # we take only dist, css, and img directories
        dir_name, file_name = os.path.split(member)
        # skip directories
        if not file_name:
            continue
        _, base_dir = os.path.split(dir_name)
        if file_name == self._html_file:
            if is_verbose:
                print "Adapting ", self._html_file
            # adapt the html into a template
            source = zip_file.open(member)
            content = "{% load static from staticfiles %}\n" + source.read()
            for orig, replacement in self._replacements:
                content = content.replace(orig, replacement)
            source.close()
            target = os.path.join(self._templates_root, 'agendas', self._html_file)
            with open(target, 'w') as f:
                f.write(content)
        elif base_dir in self._DIRS:
            target_dir = self._DIRS[base_dir]
            if is_verbose:
                print "Copying {0} to {1}".format(member, target_dir)
            # make sure we have the target_dir dir
            try:
                os.makedirs(target_dir)
            except OSError:
                pass
            source = zip_file.open(member)
            target = file(os.path.join(target_dir, file_name), "wb")
            shutil.copyfileobj(source, target)
            source.close()
            target.close()
class Feed(object):
    """A collection of CSV files with headers, either zipped into an archive
    or loose in a folder."""

    def __init__(self, filename, strip_fields=True):
        self.filename = filename
        self.feed_name = derive_feed_name(filename)
        self.zf = None
        self.strip_fields = strip_fields
        self.empty_to_none = True
        if not os.path.isdir(filename):
            self.zf = ZipFile(filename)
        if six.PY2:
            self.reader = self.python2_reader
        else:
            self.reader = self.python3_reader

    def __repr__(self):
        return '<Feed %s>' % self.filename

    def python2_reader(self, filename):
        if self.zf:
            try:
                binary_file_handle = self.zf.open(filename, 'rU')
            except IOError:
                raise IOError('%s is not present in feed' % filename)
        else:
            binary_file_handle = open(os.path.join(self.filename, filename), "rb")
        reader = csv.reader(binary_file_handle)
        for row in reader:
            yield [six.text_type(x, 'utf-8') for x in row]

    def python3_reader(self, filename):
        if self.zf:
            try:
                text_file_handle = io.TextIOWrapper(
                    self.zf.open(filename, "r"), encoding="utf-8")
            except IOError:
                raise IOError('%s is not present in feed' % filename)
        else:
            text_file_handle = open(os.path.join(self.filename, filename),
                                    "r", encoding="utf-8")
        return csv.reader(text_file_handle)

    def read_table(self, filename, columns):
        if self.strip_fields:
            rows = (_row_stripper(row) for row in self.reader(filename))
        else:
            rows = self.reader(filename)
        if self.empty_to_none:
            # Set empty strings to None, let nullable handle missing values.
            rows = ((x if x else None for x in row) for row in rows)
        feedtype = filename.rsplit('/')[-1].rsplit('.')[0].title().replace('_', '')
        return CSV(feedtype=feedtype, rows=rows, columns=columns)
def read_single_sheet(path, name=None):
    """ Read an xlsx, csv or tsv from a zipfile or directory """
    from zipfile import ZipFile
    import xlreader

    if name is None:
        root, ext = os.path.splitext(path)
        stream = open(path, 'rb')
        if ext == '.xlsx':
            return read_xl(stream)
        if ext == '.tsv':
            return read_csv(stream, dialect='excel-tab')
        if ext == '.csv':
            return read_csv(stream)
        raise ValueError('Unknown file extension for %r' % path)

    if path.endswith('.xlsx'):
        return xlreader.DictReader(open(path, 'rb'), sheetname=name)

    if path.endswith('.zip'):
        zf = ZipFile(path)
        names = zf.namelist()
        if (name + '.xlsx') in names:
            stream = zf.open(name + '.xlsx', 'r')
            return read_xl(stream)
        if (name + '.tsv') in names:
            stream = zf.open(name + '.tsv', 'rU')
            return read_csv(stream, dialect='excel-tab')
        if (name + '.csv') in names:
            stream = zf.open(name + '.csv', 'rU')
            return read_csv(stream)

    if os.path.isdir(path):
        root = os.path.join(path, name)
        if os.path.exists(root + '.xlsx'):
            stream = open(root + '.xlsx', 'rb')
            return read_xl(stream)
        if os.path.exists(root + '.tsv'):
            stream = open(root + '.tsv', 'rbU')
            return read_csv(stream, dialect='excel-tab')
        if os.path.exists(root + '.csv'):
            stream = open(root + '.csv', 'rbU')
            return read_csv(stream)

    return []
def testConvertDocy(self):
    """Test conversion of docy to docx and back"""
    x_data = Handler(self.tmp_url,
                     open("data/test_with_image.docy").read(),
                     "docy", **self.kw).convert("docx")
    self.assertIn("word/", x_data[:2000])
    y_data = Handler(self.tmp_url, x_data, "docx", **self.kw).convert("docy")
    y_zip = ZipFile(StringIO(y_data))
    y_body_data = y_zip.open("body.txt").read()
    self.assertTrue(y_body_data.startswith("DOCY;v10;0;"),
                    "%r... does not start with 'DOCY;v10;0;'" % (y_body_data[:20],))
    y_zip.open("media/image1.png")
def __call__(self, zipfile):
    zipfile = ZipFile(zipfile)
    filenames = zipfile.namelist()
    xls_files = [x for x in filenames if x.endswith('xls')]
    doc_files = [x for x in filenames if x.endswith('doc')]
    if len(xls_files) > 1:
        raise Exception(_("Zip file contains too many excel files"))
    if not xls_files:
        raise Exception(_("Zip file contains no excel files"))
    return (StringIO(zipfile.open(xls_files[0]).read()),
            [StringIO(zipfile.open(x).read()) for x in doc_files])
def testConvertDocx(self):
    """Test conversion of docx to docy and back"""
    y_data = Handler(self.tmp_url,
                     open("data/test_with_image.docx").read(),
                     "docx", **self.kw).convert("docy")
    y_zip = ZipFile(StringIO(y_data))
    y_body_data = y_zip.open("body.txt").read()
    self.assertTrue(y_body_data.startswith("DOCY;v10;0;"),
                    "%r... does not start with 'DOCY;v10;0;'" % (y_body_data[:20],))
    y_zip.open("media/image1.png")
    x_data = Handler(self.tmp_url, y_data, "docy", **self.kw).convert("docx")
    # magic inspired by https://github.com/minad/mimemagic/pull/19/files
    self.assertIn("word/", x_data[:2000])
def import_gtfs(self, gtfs_file, verbose=False):
    """Import a GTFS file as feed

    Keyword arguments:
    gtfs_file - A path or file-like object for the GTFS feed

    Returns a list of objects imported
    """
    z = ZipFile(gtfs_file, 'r')
    files = z.namelist()

    gtfs_order = (
        ('agency.txt', Agency),
        ('stops.txt', Stop),
        ('routes.txt', Route),
        ('calendar.txt', Service),
        ('calendar_dates.txt', ServiceDate),
        ('shapes.txt', ShapePoint),
        ('trips.txt', Trip),
        ('stop_times.txt', StopTime),
        ('frequencies.txt', Frequency),
        ('fare_attributes.txt', Fare),
        ('fare_rules.txt', FareRule),
        ('transfers.txt', Transfer),
        ('feed_info.txt', FeedInfo),
    )

    post_save.disconnect(dispatch_uid='post_save_shapepoint')
    post_save.disconnect(dispatch_uid='post_save_stop')
    try:
        for table_name, klass in gtfs_order:
            for f in files:
                if f.endswith(table_name):
                    table = z.open(f, 'rU')
                    if verbose:
                        rows = len(list(csv.reader(table)))
                        print("importing {x} rows of {table}".format(
                            x=rows, table=table_name))
                        # counting the rows consumed the handle, so reopen it
                        table = z.open(f, 'rU')
                    klass.import_txt(table, self, verbose=verbose)
    finally:
        post_save.connect(post_save_shapepoint, sender=ShapePoint)
        post_save.connect(post_save_stop, sender=Stop)

    # Update geometries
    print("updating geometries...")
    # TODO: Add test feed that includes shapes (issue #20)
    for shape in self.shape_set.all():  # pragma: no cover
        shape.update_geometry(update_parent=False)
    for trip in Trip.objects.in_feed(self):
        trip.update_geometry(update_parent=False)
    for route in self.route_set.all():
        route.update_geometry()
def configure_search_replace(request):
    if request.method == 'GET':
        zf_in = ZipFile(request.session['stored_archive_filename'], mode='r')
        all_filenames_lst = zf_in.namelist()
        all_filenames = set(all_filenames_lst)
        assert len(all_filenames) == len(all_filenames_lst), \
            "Duplicate filenames in the input file?!"
        zf_in.close()
        return render_to_response('docx_search_replace/configure_search_replace.html',
                                  {'filenames': sorted(all_filenames)})
    elif request.method == 'POST':
        replacements = []
        for i in range(1, 6):
            # We have input fields "from1", "to1"... "from5", "to5"
            if request.POST['from%d' % i]:
                replacements.append((request.POST['from%d' % i],
                                     request.POST['to%d' % i]))
        logging.info('replacements: %s' % replacements)
        selected_filenames = [k for k in request.POST if request.POST[k] == 'on']
        logging.info('selected_filenames: %s' % selected_filenames)
        zf_in = ZipFile(request.session['stored_archive_filename'], mode='r')
        all_filenames = zf_in.namelist()
        stored_output_file = tempfile.NamedTemporaryFile(delete=False)
        zf_out = ZipFile(stored_output_file.name, mode='w',
                         compression=zf_in.compression)
        for fname in selected_filenames:
            file_contents = zf_in.open(fname).read().decode('utf-8')
            for r in replacements:
                file_contents = file_contents.replace(*r)
            zf_out.writestr(fname, file_contents.encode('utf-8'))
        filenames_to_copy_unchanged = set(all_filenames) - set(selected_filenames)
        for fname in filenames_to_copy_unchanged:
            zf_out.writestr(fname, zf_in.open(fname).read(),
                            compress_type=ZIP_DEFLATED)
        zf_in.close()
        zf_out.close()
        orig_uploaded_filename = request.session['uploaded_filename']
        if orig_uploaded_filename.endswith('.docx'):
            downloading_filename = re.sub('.docx$', '_EDITED.docx',
                                          orig_uploaded_filename)
        else:
            downloading_filename = orig_uploaded_filename + '_EDITED'
        ret_file = open(stored_output_file.name, 'rb')
        resp = HttpResponse(
            status=200, content=ret_file.read(),
            mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        resp['Content-Disposition'] = 'attachment; filename="%s"' % downloading_filename
        return resp
    else:
        return HttpResponseBadRequest('Invalid method: %s' % request.method)
def main(args):
    if len(args) < 2:
        print 'Usage: analyze_olp.py filename'
        exit(1)
    olpfile = ZipFile(args[1], "r")
    channelfile = olpfile.open('channel.labels')
    channels = [line.strip() for line in channelfile.readlines()]
    channelfile.close()
    chanmap = {}
    for i in xrange(0, len(channels)):
        chanmap[channels[i]] = i
    datafile = olpfile.open('data.ols')
    analyze_delays(chanmap, datafile)
def main():
    pattern = re.compile(r'\d{2,}')
    zf = ZipFile('channel.zip')
    fp = zf.open('readme.txt')
    chain = open('chain.txt', 'w')
    text = fp.read()
    number = pattern.search(text).group(0)
    # Follow the chain of numbered files; the loop ends with an exception
    # once the final file contains no further number (or no matching member).
    while True:
        finfo = zf.getinfo(number + '.txt')
        print finfo.comment
        print number
        text = zf.open(finfo).read()
        chain.write(finfo.comment)
        number = pattern.search(text).group(0)
def scan(pk3dir, basedir):
    """
    Scan pk3 files in a folder

    Check for shader conflicts and build a report of texture usages.

    Args:
        pk3dir - Path to a directory containing pk3 files to scan
        basedir - Path to directory of a clean game installation (basewsw)
    """
    logger = logging.getLogger('scan')
    pk3path = Path(pk3dir)
    basepath = Path(basedir)
    if not pk3path.is_dir():
        logger.error('{} is not a valid directory'.format(pk3path))
        sys.exit(1)
    if not basepath.is_dir():
        logger.error('{} is not a valid directory'.format(basepath))
        sys.exit(1)

    # Build an index of base game files to check against
    basefiles = set()
    for pk3file in basepath.glob('*.pk3'):
        pk3zip = ZipFile(str(pk3file))
        for name in pk3zip.namelist():
            if name.endswith('/'):
                continue
            elif name.endswith('.shader'):
                basefiles.update(parse_shader(pk3zip.open(name)))
            else:
                basefiles.add(name)

    # Check if pk3s include the same files
    for pk3file in pk3path.glob('*.pk3'):
        try:
            pk3zip = ZipFile(str(pk3file))
        except BadZipfile:
            logging.error('error: {} is not a zipfile!'.format(pk3file))
            continue
        for name in pk3zip.namelist():
            if name in basefiles:
                logging.error('{} overwrites file {}'.format(pk3file, name))
            if name.endswith('.shader'):
                for texture in basefiles & parse_shader(pk3zip.open(name)):
                    logging.error('{} overwrites file {}'.format(pk3file, texture))
def fetch_wilt(data_home=None, download_if_missing=True,
               random_state=None, shuffle=False):
    """Load the wilt dataset, downloading it if necessary."""
    URL = ('http://archive.ics.uci.edu/ml/'
           'machine-learning-databases/00285/wilt.zip')
    data_home = get_data_home(data_home=data_home)
    wilt_dir = join(data_home, "wilt")
    samples_path = _pkl_filepath(wilt_dir, "samples")
    targets_path = _pkl_filepath(wilt_dir, "targets")
    available = exists(samples_path)

    if download_if_missing and not available:
        makedirs(wilt_dir, exist_ok=True)
        logger.warning("Downloading %s" % URL)
        f = BytesIO(urlopen(URL).read())
        # or: X = np.load(f)
        ff = ZipFile(f, mode='r')
        file1 = ff.open('training.csv')
        Xy1 = np.genfromtxt(file1, delimiter=',', dtype=object)
        file2 = ff.open('testing.csv')
        Xy2 = np.genfromtxt(file2, delimiter=',', dtype=object)
        # the first row is the header (parsed as nan):
        Xy1 = Xy1[1:, :]
        Xy2 = Xy2[1:, :]
        Xy = np.r_[Xy1, Xy2]
        X = Xy[:, 1:].astype(float)
        y = Xy[:, 0]
        joblib.dump(X, samples_path, compress=9)
        joblib.dump(y, targets_path, compress=9)

    try:
        X, y
    except NameError:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    if shuffle:
        ind = np.arange(X.shape[0])
        rng = check_random_state(random_state)
        rng.shuffle(ind)
        X = X[ind]
        y = y[ind]

    return Bunch(data=X, target=y, DESCR=__doc__)
def examine_zip(filepath):
    zipper = ZipFile(filepath)
    files = zipper.infolist()
    config_files = [f for f in files if f.filename.lower().endswith(".ini")]
    if config_files:
        config_file = zipper.open(config_files[0])
        config = ConfigReader(file=config_file)
    else:
        config = None
    xml_files = [zipper.open(f) for f in files
                 if f.filename.lower().endswith(".xml")]
    return (config, xml_files)
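# A brief usage sketch for examine_zip() above. The archive name "bundle.zip"
# is hypothetical; ConfigReader comes from the snippet's own codebase:
config, xml_files = examine_zip("bundle.zip")
if config is not None:
    print("found an .ini config in the archive")
for xml_file in xml_files:
    print(xml_file.name)  # each entry is a ZipExtFile handle on an .xml member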
class Dataset(object):
    loaders = {
        'adjustment': AdjustmentLoader,
        'fee': FeeLoader,
        'product': ProductLoader,
        'rate': RateLoader,
        'region': RegionLoader,
    }

    def __init__(self, f):
        self.zf = ZipFile(f)

    @cached_property
    def cover_sheet(self):
        with self.zf.open('CoverSheet.xml') as f:
            return CoverSheet(f)

    @cached_property
    def timestamp(self):
        ts = datetime.combine(self.cover_sheet.date, time.min)
        return timezone.make_aware(ts, timezone.get_current_timezone())

    @cached_property
    def filename_prefix(self):
        return self.cover_sheet.date.strftime('%Y%m%d')

    def load(self):
        # Sort the list of loaders so that Region loads last, as a bellwether
        for key, loader_cls in sorted(self.loaders.items()):
            try:
                f = self.datafile(key)
            except KeyError:
                # The fees data is expected to be temporarily unavailable,
                # so if the fees file is not found, we skip it and
                # continue loading the other data types.
                if key == 'fee':
                    continue
                raise
            # The zip file may be opened as binary, but we want to process the
            # files that it contains as text.
            f_text = io.TextIOWrapper(f)
            loader = loader_cls(f_text, data_timestamp=self.timestamp)
            loader.load()

    def datafile(self, name):
        filename = '{}_{}.txt'.format(self.filename_prefix, name)
        return self.zf.open(filename)
def acquire_all_resources(self, format_dict):
    import cStringIO as StringIO
    from zipfile import ZipFile

    # Download archive.
    url = self.url(format_dict)
    shapefile_online = self._urlopen(url)
    zfh = ZipFile(StringIO.StringIO(shapefile_online.read()), 'r')
    shapefile_online.close()

    # Iterate through all scales and levels and extract relevant files.
    modified_format_dict = dict(format_dict)
    scales = ('c', 'l', 'i', 'h', 'f')
    levels = (1, 2, 3, 4)
    for scale, level in itertools.product(scales, levels):
        modified_format_dict.update({'scale': scale, 'level': level})
        target_path = self.target_path(modified_format_dict)
        target_dir = os.path.dirname(target_path)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for member_path in self.zip_file_contents(modified_format_dict):
            ext = os.path.splitext(member_path)[1]
            target = os.path.splitext(target_path)[0] + ext
            member = zfh.getinfo(member_path)
            with open(target, 'wb') as fh:
                fh.write(zfh.open(member).read())

    zfh.close()
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's
    Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
def get_data_famafrench(name, start=None, end=None):
    start, end = _sanitize_dates(start, end)

    # path of zip files
    zipFileURL = "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"

    url = urllib.urlopen(zipFileURL + name + ".zip")
    zipfile = ZipFile(StringIO(url.read()))
    data = zipfile.open(name + ".txt").readlines()

    file_edges = np.where(np.array([len(d) for d in data]) == 2)[0]

    datasets = {}
    for i in range(len(file_edges) - 1):
        dataset = [d.split() for d in data[(file_edges[i] + 1):file_edges[i + 1]]]
        if len(dataset) > 10:
            ncol = np.median(np.array([len(d) for d in dataset]))
            header_index = np.where(
                np.array([len(d) for d in dataset]) == (ncol - 1))[0][-1]
            header = dataset[header_index]
            # to ensure the header is unique
            header = [str(j + 1) + " " + header[j] for j in range(len(header))]
            index = np.array(
                [d[0] for d in dataset[(header_index + 1):]], dtype=int)
            dataset = np.array(
                [d[1:] for d in dataset[(header_index + 1):]], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
def getTranslations(type, localesDir, defaultLocale, projectName, key):
    result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/export?key=%s'
                             % (projectName, key)).read()
    if result.find('<success') < 0:
        raise Exception('Server indicated that the operation was not successful\n'
                        + result)

    result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/download/all.zip?key=%s'
                             % (projectName, key)).read()
    zip = ZipFile(StringIO(result))
    dirs = {}
    for info in zip.infolist():
        if not info.filename.endswith('.json'):
            continue

        dir, file = os.path.split(info.filename)
        if not re.match(r'^[\w\-]+$', dir) or dir == defaultLocale:
            continue
        if type == 'chrome' and file.count('.') == 1:
            origFile = file
        else:
            origFile = re.sub(r'\.json$', '', file)
        if type == 'gecko' and not origFile.endswith('.dtd') and \
                not origFile.endswith('.properties'):
            continue

        mapping = langMappingChrome if type == 'chrome' else langMappingGecko
        for key, value in mapping.iteritems():
            if value == dir:
                dir = key
        if type == 'chrome':
            dir = dir.replace('-', '_')

        data = zip.open(info.filename).read()
        if data == '[]':
            continue

        if not dir in dirs:
            dirs[dir] = set()
        dirs[dir].add(origFile)

        path = os.path.join(localesDir, dir, origFile)
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        if type == 'chrome' and origFile.endswith('.json'):
            postprocessChromeLocale(path, data)
        elif type == 'chrome':
            data = json.loads(data)
            if origFile in data:
                fileHandle = codecs.open(path, 'wb', encoding='utf-8')
                fileHandle.write(data[origFile]['message'])
                fileHandle.close()
        else:
            fromJSON(path, data)

    # Remove any extra files
    for dir, files in dirs.iteritems():
        baseDir = os.path.join(localesDir, dir)
        if not os.path.exists(baseDir):
            continue
        for file in os.listdir(baseDir):
            path = os.path.join(baseDir, file)
            if os.path.isfile(path) and \
                    (file.endswith('.json') or file.endswith('.properties') or
                     file.endswith('.dtd')) and not file in files:
                os.remove(path)
def load_property_inspection():
    """
    Loads and returns several variables for the data set from Kaggle's
    Property Inspection Prediction competition.
    Link: https://www.kaggle.com/c/liberty-mutual-group-property-inspection-prediction

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data', 'property_inspection.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('Id')

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the categorical variables from strings to integers
    encoder = CategoryEncoder()
    X = encoder.fit_transform(X)

    return data, X, y
def load_forest_cover():
    """
    Loads and returns several variables for the data set from Kaggle's
    Forest Cover Type Prediction competition.
    Link: https://www.kaggle.com/c/forest-cover-type-prediction

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'data', 'forest_cover.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('Id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    return data, X, y
def get_sightings_from_atlas(uri, species_ids):
    # Create a dict of sightings
    # Each species ID will have a list of sightings with [lat, long]
    sightings = dict()
    for species_id in species_ids:
        sightings[species_id] = []

    # The CSV headers
    LONG = 0
    LAT = 1
    LSID = 2

    # Download API call and unzip
    url = urlopen(uri)
    zipfile = ZipFile(StringIO(url.read()))

    # Skip the header row using [1:]
    for line in zipfile.open("data.csv").readlines()[1:]:
        sighting_record = line.split(",")
        sightings[sighting_record[LSID][1:-2]].append(
            [sighting_record[LAT][1:-1], sighting_record[LONG][1:-1]])

    for species_id in species_ids:
        # Don't return too many sightings for a single species
        sightings[species_id] = sightings[species_id][0:species_sighting_limit]
        # Prune any empty entries
        if sightings[species_id] == []:
            del sightings[species_id]

    return sightings
def download_unzip(input_zip):
    url = urllib.urlopen(input_zip)
    unzipped_string = ''
    zipfile = ZipFile(StringIO(url.read()))
    for name in zipfile.namelist():
        unzipped_string += zipfile.open(name).read()
    return unzipped_string
def acquire_resource(self, target_path, format_dict):
    """
    Downloads the zip file and extracts the files listed in
    :meth:`zip_file_contents` to the target path.

    """
    import cStringIO as StringIO
    from zipfile import ZipFile

    target_dir = os.path.dirname(target_path)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    url = self.url(format_dict)

    shapefile_online = self._urlopen(url)

    zfh = ZipFile(StringIO.StringIO(shapefile_online.read()), 'r')

    for member_path in self.zip_file_contents(format_dict):
        ext = os.path.splitext(member_path)[1]
        target = os.path.splitext(target_path)[0] + ext
        member = zfh.getinfo(member_path)
        with open(target, 'wb') as fh:
            fh.write(zfh.open(member).read())

    shapefile_online.close()
    zfh.close()

    return target_path
def _zip_filehandle(filename):
    zipfile = ZipFile(filename)
    _filename = zipfile.namelist()[0]
    filehandle = zipfile.open(_filename)
    return filehandle
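# A minimal usage sketch for _zip_filehandle() above, assuming an archive
# "data.zip" whose first member is the file of interest (name hypothetical):
fh = _zip_filehandle("data.zip")
print(fh.readline())  # first line of the first member, as bytes
fh.close()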
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import gc
#import lightgbm as lgb
import time
from sklearn.metrics import r2_score
import os

os.chdir('C:/Users/andre/Documents/Github/Walmart-ACC')

# from https://stackoverflow.com/questions/44575251/reading-multiple-files-contained-in-a-zip-file-with-pandas
from zipfile import ZipFile

zip_file = ZipFile('m5-forecasting-accuracy.zip')
calendar_df = pd.read_csv(zip_file.open('calendar.csv'))
sell_prices_df = pd.read_csv(zip_file.open('sell_prices.csv'))
sales_train_validation_df = pd.read_csv(zip_file.open('sales_train_validation.csv'))
sample_submission_df = pd.read_csv(zip_file.open('sample_submission.csv'))

#sell_prices_df.info(memory_usage='deep')  # 957.5MB, around 3x bigger than MNIST
#calendar_df.info(memory_usage='deep')
#sales_train_validation_df.info(memory_usage='deep')

###############################################################################
# Memory Reduction
###############################################################################

# Calendar data type cast -> Memory Usage Reduction
calendar_df[["month", "snap_CA", "snap_TX", "snap_WI", "wday"]] = \
    calendar_df[["month", "snap_CA", "snap_TX", "snap_WI", "wday"]].astype("int8")
calendar_df[["wm_yr_wk", "year"]] = \
    calendar_df[["wm_yr_wk", "year"]].astype("int16")
calendar_df["date"] = calendar_df["date"].astype("datetime64")
def main():
    global output_encoding, datfilecomment

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input-file', required=True,
                        help='input zip file containing csv databases')
    parser.add_argument('-o', '--output-file', help='output GeoIP dat file')
    parser.add_argument('-f', '--fips-file', help='geonameid to fips code mappings')
    parser.add_argument('-e', '--encoding',
                        help='encoding to use for the output rather than utf-8')
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='debug mode')
    parser.add_argument('-6', '--ipv6', action='store_const', default='IPv4',
                        const='IPv6', help='use ipv6 database')
    opts = parser.parse_args()

    if opts.encoding:
        try:
            codecs.lookup(opts.encoding)
        except LookupError as e:
            print(e)
            sys.exit(1)
        output_encoding = opts.encoding

    re_entry = re.compile(
        r'.*?/Geo(?:Lite|IP)2-(?P<database>.*?)-(?P<filetype>.*?)-(?P<arg>.*)\.csv')

    entries = defaultdict(lambda: defaultdict(dict))
    ziparchive = ZipFile(opts.input_file)
    for entry in ziparchive.filelist:
        match = re_entry.match(entry.filename)
        if match is None:
            continue
        db, filetype, arg = match.groups()
        entries[db][filetype][arg] = entry

    if len(entries) != 1:
        print('More than one kind of database found, please check the archive')
        sys.exit(1)

    # noinspection PyUnboundLocalVariable
    datfilecomment = '{} converted to legacy MaxMind DB with geolite2legacy'.format(
        os.path.dirname(entry.filename))

    dbtype, entries = entries.popitem()
    if dbtype == 'ASN':
        locs = None
    else:
        if not {'Locations', 'Blocks'} <= set(entries.keys()):
            print('Missing Locations or Block files, please check the archive')
            sys.exit(1)
        locs = entries['Locations'].get('en')
        if locs is None:
            print('Selected locale not found in archive')
            sys.exit(1)
        locs = TextIOWrapper(ziparchive.open(locs, 'r'), encoding='utf-8')

    if dbtype not in RTree:
        print('{} not supported'.format(dbtype))
        sys.exit(1)

    r = RTree[dbtype][opts.ipv6](debug=opts.debug)

    blocks = entries['Blocks'].get(opts.ipv6)
    if blocks is None:
        print('The selected block file not found in archive')
        sys.exit(1)

    if dbtype != 'ASN':
        fips_file = opts.fips_file or os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'geoname2fips.csv')
        parse_fips(fips_file)

    tstart = time()
    print('Database type {} - Blocks {} - Encoding: {}'.format(
        dbtype, opts.ipv6, output_encoding))

    r.load(locs, TextIOWrapper(ziparchive.open(blocks, 'r'), encoding='utf-8'))

    if not opts.output_file:
        opts.output_file = Filenames[dbtype][opts.ipv6]
    print('Output file {}'.format(opts.output_file))

    with open(opts.output_file, 'wb') as output:
        r.serialize(output)
    tstop = time()

    print('wrote %d-node trie with %d networks (%d distinct labels) in %d seconds'
          % (len(r.segments), r.netcount, len(r.data_offsets), tstop - tstart))
class XMLparser(object):
    """Parser for docx files."""

    def __init__(self):
        self.uploadDir = os.path.join(basedir, "static/upload/htmlcoder")  # path of the upload folder
        self.docxPath = self._get_docx_path()  # path of the docx file
        self.docxName = self._get_docx_name()  # name of the docx file
        self.docx = ZipFile(self.docxPath, "r")  # ZipFile object for the docx file
        self.documentXml = self.docx.read("word/document.xml").decode(
            "utf-8")  # document.xml defines the structure of the docx
        self.imgRelsXml = self.docx.read(
            "word/_rels/document.xml.rels").decode(
                "utf-8")  # document.xml.rels defines the img id-to-path mapping
        self.stylesRelsXml = self.docx.read("word/styles.xml").decode(
            "utf-8")  # styles.xml defines the style id-to-name mapping
        self.imgRels = self._get_imgRels()  # parsed img id-to-path mapping
        self.styleRels = self._get_styleRels()  # parsed style id-to-name mapping
        self.imgNames = dict()  # MD5-based image file names, keyed by imgId

    def _get_docx_path(self):
        """Return the path of the docx file in the upload directory; with
        several files present, the newest by creation time wins."""
        docxDir = self.uploadDir
        docxDirList = [
            dirName for dirName in os.listdir(docxDir)
            if dirName.endswith(".docx")
        ]  # keep only docx files
        docxDirList.sort(key=lambda dirName: os.path.getctime(
            os.path.join(docxDir, dirName)), reverse=True)
        return os.path.join(docxDir, docxDirList[0])  # newest docx file

    def _get_docx_name(self):
        """Extract the docx name from its file name."""
        return self.docxPath[self.docxPath.rindex("/") + 1:
                             self.docxPath.rindex(".docx")]

    def _get_imgRels(self):
        """Parse the img id-to-path mapping from the xml."""
        relDict = dict()
        for soup in BeautifulSoup(self.imgRelsXml,
                                  "lxml-xml").find_all("Relationship"):
            relId = soup.get("Id")
            relTarget = soup.get("Target")  # target of the corresponding image
            if relTarget[:5] == "media":  # keep only mappings that start with media
                relDict[relId] = relTarget
        return relDict

    def _get_styleRels(self):
        """Parse the style id-to-name mapping from the xml."""
        relDict = dict()
        for soup in BeautifulSoup(self.stylesRelsXml,
                                  "lxml-xml").find_all("w:style"):
            styleId = soup.get("w:styleId")
            styleName = soup.find("w:name").get(
                "w:val")  # map each style id to its name
            relDict[styleId] = styleName
        return relDict

    def get_img_path(self, imgId, local=False):
        """Look up the image path through the generated img-id mapping."""
        imgPath = self.imgRels.get(imgId)
        if local:
            # for local testing: return the raw path, pointing into a
            # manually unpacked docx folder
            return imgPath
        else:
            imgFmt = imgPath[imgPath.rindex(".") + 1:]  # image format
            imgName = "{}.{}".format(self._get_img_name(imgPath), imgFmt)
            self.imgNames[imgId] = imgName  # remember the file name
            return "/static/upload/htmlcoder/%s" % imgName  # path the browser will request

    def _get_img_name(self, imgPath):
        """Compute the image's MD5 and return it as the file name."""
        imgBytes = self.docx.open(os.path.join("word", imgPath)).read()
        return get_MD5(imgBytes)

    def get_style(self, styleId):
        """Look up the style name through the generated style-id mapping."""
        return self.styleRels.get(styleId)

    def write_to_file(self, filePath, xml, format=True):
        """For testing: write the xml to a local file."""
        soup = BeautifulSoup(xml, "lxml")
        with open(filePath, "w", encoding="utf-8") as fp:
            if format:
                fp.write(soup.prettify())
            else:
                fp.write(str(soup))

    def extract_imgs(self):
        """Extract and rename the images inside the docx."""
        uploadDirSet = {
            dirName[:dirName.rindex(".")]
            for dirName in os.listdir(self.uploadDir)
        }  # file names in the upload folder, without extensions
        for imgId, imgName in self.imgNames.items():
            if imgName in uploadDirSet:  # duplicate image, already extracted
                continue
            else:
                imgPath = self.imgRels[imgId]  # img path inside the docx
                with open(os.path.join(self.uploadDir, imgName), "wb") as fp:
                    # save the docx image locally
                    fp.write(self.docx.open(os.path.join("word", imgPath)).read())
def _get_interactions(self, limit):
    LOG.info("getting interactions")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['interactions']['file']))
    myzip = ZipFile(f, 'r')
    # assume that the first entry is the item
    fname = myzip.namelist()[0]
    matchcounter = 0

    with myzip.open(fname, 'r') as csvfile:
        for line in csvfile:
            # skip comment lines
            if re.match(r'^#', line.decode()):
                LOG.debug("Skipping header line")
                continue
            line_counter += 1
            line = line.decode().strip()
            # print(line)
            (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
             aliases_b, detection_method, pub_author, pub_id, taxid_a,
             taxid_b, interaction_type, source_db, interaction_id,
             confidence_val) = line.split('\t')

            taxid_a = taxid_a.rstrip()
            taxid_b = taxid_b.rstrip()

            # get the actual gene ids,
            # typically formatted like: gene/locuslink:351|BIOGRID:106848
            gene_a_num = re.search(r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
            gene_b_num = re.search(r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

            if self.test_mode:
                graph = self.testgraph
                # skip any genes that don't match our test set
                if (int(gene_a_num) not in self.test_ids) or \
                        (int(gene_b_num) not in self.test_ids):
                    continue
            else:
                graph = self.graph
                # when not in test mode, filter by taxon
                if int(taxid_a.split(':')[-1]) not in self.tax_ids or \
                        int(taxid_b.split(':')[-1]) not in self.tax_ids:
                    continue
                else:
                    matchcounter += 1

            gene_a = 'NCBIGene:' + gene_a_num
            gene_b = 'NCBIGene:' + gene_b_num

            # get the interaction type
            # psi-mi:"MI:0407"(direct interaction)
            int_type = re.search(r'MI:\d+', interaction_type).group()
            rel = self.resolve(int_type, False)
            if rel == int_type:
                rel = self.globaltt['interacts with']

            # scrub pubmed-->PMID prefix
            pub_id = re.sub(r'pubmed', 'PMID', pub_id)
            # remove bogus whitespace
            pub_id = pub_id.strip()

            # get the method, and convert to evidence code
            det_code = re.search(r'MI:\d+', detection_method).group()
            evidence = self.resolve(det_code, False)
            if evidence == det_code:
                evidence = self.globaltt["experimental evidence"]

            # note that the interaction_id is some kind of internal biogrid
            # identifier that does not map to a public URI.
            # we will construct a monarch identifier from this
            assoc = InteractionAssoc(graph, self.name, gene_a, gene_b, rel)
            assoc.add_evidence(evidence)
            assoc.add_source(pub_id)
            assoc.add_association_to_graph()

            if not self.test_mode and \
                    (limit is not None and line_counter > limit):
                break

    myzip.close()
    return
class TestGckWithDGVC(unittest.TestCase):

    sample_data = {
        "pACW": {
            "file": "Drosophila Gateway Vectors GCK/pACW",
            "name": "Construct:",
            "id": "Construct:",
            "description": "Construct: pACTIN-RW-SV",
            "length": 7957,
            "topology": "circular",
            "features": [
                {"type": "CDS", "start": 6155, "end": 7013, "strand": 1, "label": "ampR"},
                {"type": "misc_feature", "start": 5216, "end": 6071, "strand": 1, "label": "SV40 sti/polyA"},
                {"type": "misc_feature", "start": 89, "end": 2662, "strand": 1, "label": "actin5C promoter"},
                {"type": "CDS", "start": 3722, "end": 4400, "strand": 1, "label": "chlR"},
                {"type": "CDS", "start": 4722, "end": 5025, "strand": 1, "label": "ccdB"},
                {"type": "misc_feature", "start": 3489, "end": 3507, "strand": 1, "label": "attR1"},
                {"type": "misc_feature", "start": 5175, "end": 5193, "strand": -1, "label": "attR2"},
                {"type": "misc_feature", "start": 3489, "end": 5193, "strand": 1, "label": "Gateway cassette"},
                {"type": "misc_feature", "start": 5192, "end": 5205, "strand": 1, "label": "triple STOP"},
                {"type": "CDS", "start": 2763, "end": 3480, "strand": 1, "label": "ECFP"},
                {"type": "misc_feature", "start": 2755, "end": 3482, "strand": 1, "label": "pACTIN-SV"},
                {"type": "misc_feature", "start": 2755, "end": 3482, "strand": 1, "label": "Construct: pACTIN-RW-SV"},
            ],
        },
        "pPWF": {
            "file": "Drosophila Gateway Vectors GCK/pPWG",
            "name": "Construct:",
            "id": "Construct:",
            "description": "Construct: pPWF",
            "length": 12320,
            "topology": "circular",
            "features": [
                {"type": "misc_feature", "start": 0, "end": 587, "strand": 1, "label": "P 5' end"},
                {"type": "misc_feature", "start": 9327, "end": 9560, "strand": -1, "label": "P 3' end"},
                {"type": "misc_feature", "start": 1363, "end": 4244, "strand": -1, "label": "mini-white"},
                {"type": "CDS", "start": 10466, "end": 11324, "strand": 1, "label": "ampR"},
                {"type": "misc_feature", "start": 7930, "end": 9314, "strand": 1, "label": "K10 terminator"},
                {"type": "misc_feature", "start": 4762, "end": 4829, "strand": 1, "label": "GAGA repeats"},
                {"type": "misc_feature", "start": 4855, "end": 5177, "strand": 1, "label": "GAL4 sites"},
                {"type": "misc_feature", "start": 5279, "end": 5415, "strand": 1, "label": "P intron"},
                {"type": "misc_feature", "start": 5184, "end": 5279, "strand": 1, "label": "P promoter"},
                {"type": "misc_feature", "start": 4762, "end": 5416, "strand": 1, "label": "UASp promoter"},
                {"type": "misc_feature", "start": 10060, "end": 12092, "strand": 1, "label": "pUC8"},
                {"type": "misc_feature", "start": 7106, "end": 7124, "strand": -1, "label": "attR2"},
                {"type": "CDS", "start": 6653, "end": 6956, "strand": 1, "label": "ccdB"},
                {"type": "CDS", "start": 5653, "end": 6331, "strand": 1, "label": "chlR"},
                {"type": "misc_feature", "start": 5420, "end": 7124, "strand": 1, "label": "Gateway Cassette"},
                {"type": "misc_feature", "start": 5420, "end": 5438, "strand": 1, "label": "attR1"},
                {"type": "CDS", "start": 7137, "end": 7854, "strand": 1, "label": "EGFP"},
                {"type": "misc_feature", "start": 7129, "end": 7856, "strand": 1, "label": "pACTIN-SV"},
                {"type": "misc_feature", "start": 7129, "end": 7856, "strand": 1, "label": "Construct: pACTIN-WC-SV"},
                {"type": "misc_feature", "start": 5416, "end": 7875, "strand": 1, "label": "Construct: pPWF"},
            ],
        },
    }

    def setUp(self):
        # We are using the files of the Drosophila Gateway Vector Collection
        # (<https://emb.carnegiescience.edu/drosophila-gateway-vector-collection>)
        # as sample Gck files. We cannot redistribute those files along with
        # Biopython, so we need to download them now for the tests to run.
        if not os.path.exists("Gck/DGVC_GCK.zip"):
            try:
                requires_internet.check()
            except MissingExternalDependencyError:
                self.skipTest("Sample files missing and no Internet access")
                return
            try:
                with urlopen(
                    "https://emb.carnegiescience.edu/sites/default/files/DGVC_GCK.zip"
                ) as src, open("Gck/DGVC_GCK.zip", "wb") as dst:
                    shutil.copyfileobj(src, dst)
            except HTTPError:
                self.skipTest("Cannot download the sample files")
                return
        self.zipdata = ZipFile("Gck/DGVC_GCK.zip")

    def tearDown(self):
        self.zipdata.close()

    def test_read(self):
        """Read sample files."""
        for sample in self.sample_data.values():
            with self.zipdata.open(sample["file"]) as f:
                record = SeqIO.read(f, "gck")
            self.assertEqual(sample["name"], record.name)
            self.assertEqual(sample["id"], record.id)
            self.assertEqual(sample["description"], record.description)
            self.assertEqual(sample["length"], len(record))
            self.assertEqual(sample["topology"], record.annotations["topology"])
            self.assertEqual(len(sample["features"]), len(record.features))
            for i, exp_feat in enumerate(sample["features"]):
                read_feat = record.features[i]
                self.assertEqual(exp_feat["type"], read_feat.type)
                self.assertEqual(exp_feat["start"], read_feat.location.start)
                self.assertEqual(exp_feat["end"], read_feat.location.end)
                self.assertEqual(exp_feat["strand"], read_feat.location.strand)
                self.assertEqual(exp_feat["label"], read_feat.qualifiers["label"][0])
class Wheel:
    def __init__(self, path):
        self.path = path
        self.parsed_filename = parse_wheel_filename(os.path.basename(path))
        self.dist_info = '{0.project}-{0.version}.dist-info'\
                         .format(self.parsed_filename)

    def __enter__(self):
        self.fp = open(self.path, 'rb')
        self.zipfile = ZipFile(self.fp)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.zipfile.close()
        self.fp.close()
        return False

    @cached_property
    def record(self):
        rec = self._get_dist_info('RECORD')
        if rec is None:
            raise errors.MissingRecordError()
        with self.zipfile.open(rec) as fp:
            # The csv module requires this file to be opened with `newline=''`
            return Record.load(io.TextIOWrapper(fp, 'utf-8', newline=''))

    def verify_record(self):
        # Check everything in RECORD against actual values:
        for entry in self.record:
            if entry:
                entry.verify(self.zipfile)
            elif entry.path != self.dist_info + '/RECORD':
                raise errors.NullEntryError(entry.path)
        # Check everything in zipfile appears in RECORD (except signatures and
        # directories):
        for path in self.zipfile.namelist():
            if path not in self.record and path not in (
                self.dist_info + '/RECORD.jws',
                self.dist_info + '/RECORD.p7s',
            ) and not path.endswith('/'):
                raise errors.ExtraFileError(path)

    @cached_property
    def metadata(self):
        rec = self._get_dist_info('METADATA')
        if rec is None:
            ### TODO: This should be an error
            return None
        with self.zipfile.open(rec) as fp:
            return parse_metadata(io.TextIOWrapper(fp, 'utf-8'))

    @cached_property
    def wheel_info(self):
        rec = self._get_dist_info('WHEEL')
        if rec is None:
            ### TODO: This should be an error
            return None
        with self.zipfile.open(rec) as fp:
            return parse_wheel_info(io.TextIOWrapper(fp, 'utf-8'))

    def _get_dist_info(self, filename):
        try:
            return self.zipfile.getinfo(self.dist_info + '/' + filename)
        except KeyError:
            return None

    def inspect(self):
        namebits = self.parsed_filename
        about = {
            "filename": os.path.basename(self.path),
            "project": namebits.project,
            "version": namebits.version,
            "buildver": namebits.build,
            "pyver": namebits.python_tags,
            "abi": namebits.abi_tags,
            "arch": namebits.platform_tags,
        }
        try:
            record = self.record
        except WheelValidationError as e:
            record = None
            about["valid"] = False
            about["validation_error"] = {
                "type": type(e).__name__,
                "str": str(e),
            }
        else:
            try:
                self.verify_record()
            except WheelValidationError as e:
                about["valid"] = False
                about["validation_error"] = {
                    "type": type(e).__name__,
                    "str": str(e),
                }
            else:
                about["valid"] = True

        about["file"] = {"size": os.path.getsize(self.path)}
        self.fp.seek(0)
        about["file"]["digests"] = digest_file(self.fp, ["md5", "sha256"])

        about["dist_info"] = {}
        if self.metadata is not None:
            about["dist_info"]["metadata"] = self.metadata
        if record is not None:
            about["dist_info"]["record"] = record.for_json()
        if self.wheel_info is not None:
            about["dist_info"]["wheel"] = self.wheel_info

        for fname, parser, key in EXTRA_DIST_INFO_FILES:
            info = self._get_dist_info(fname)
            if info is not None:
                with self.zipfile.open(info) as fp:
                    about["dist_info"][key] = parser(io.TextIOWrapper(fp, 'utf-8'))

        if self._get_dist_info('zip-safe') is not None:
            about["dist_info"]["zip_safe"] = True
        elif self._get_dist_info('not-zip-safe') is not None:
            about["dist_info"]["zip_safe"] = False

        md = about["dist_info"].get("metadata", {})
        about["derived"] = {
            "description_in_body": "BODY" in md,
            "description_in_headers": "description" in md,
        }

        if "BODY" in md and "description" not in md:
            md["description"] = md["BODY"]
        md.pop("BODY", None)
        readme = md.get("description")
        if readme is not None:
            md["description"] = {"length": len(md["description"])}
            dct = md.get("description_content_type")
            if dct is None or parse_header(dct)[0] == 'text/x-rst':
                about["derived"]["readme_renders"] = render(readme) is not None
            else:
                about["derived"]["readme_renders"] = True
        else:
            about["derived"]["readme_renders"] = None

        if md.get("keywords") is not None:
            about["derived"]["keywords"], about["derived"]["keyword_separator"] \
                = split_keywords(md["keywords"])
        else:
            about["derived"]["keywords"], about["derived"]["keyword_separator"] \
                = [], None
        about["derived"]["keywords"] = sorted(set(about["derived"]["keywords"]))

        about["derived"]["dependencies"] = sorted(unique_projects(
            req["name"] for req in md.get("requires_dist", [])))

        about["derived"]["modules"] = extract_modules(
            [rec["path"] for rec in about["dist_info"].get("record", [])])

        return about
def build_check_requires_timestamp(t):
    from zipfile import ZipFile
    unused_count = 0
    all_provides = set()
    zf = ZipFile(PLOVR_JAR)
    for zi in zf.infolist():
        if zi.filename.endswith('.js'):
            if not zi.filename.startswith('closure/goog/'):
                continue
            # Skip goog.i18n because it contains so many modules that it causes
            # the generated regular expression to exceed Python's limits
            if zi.filename.startswith('closure/goog/i18n/'):
                continue
            for line in zf.open(zi, 'rU'):
                m = re.match(r'goog.provide\(\'(.*)\'\);', line)
                if m:
                    all_provides.add(m.group(1))
    for filename in sorted(t.dependencies):
        if filename == 'build/src/internal/src/requireall.js':
            continue
        require_linenos = {}
        uses = set()
        lines = open(filename, 'rU').readlines()
        for lineno, line in _strip_comments(lines):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                all_provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                require_linenos[m.group(1)] = lineno
                continue
        ignore_linenos = require_linenos.values()
        for lineno, line in enumerate(lines):
            if lineno in ignore_linenos:
                continue
            for require in require_linenos.iterkeys():
                if require in line:
                    uses.add(require)
        for require in sorted(set(require_linenos.keys()) - uses):
            t.info('%s:%d: unused goog.require: %r' %
                   (filename, require_linenos[require], require))
            unused_count += 1
    all_provides.discard('ol')
    all_provides.discard('ol.MapProperty')

    class Node(object):
        def __init__(self):
            self.present = False
            self.children = {}

        def _build_re(self, key):
            if key == '*':
                assert len(self.children) == 0
                # We want to match `.doIt` but not `.SomeClass` or `.more.stuff`
                return '(?=\\.[a-z]\\w*\\b(?!\\.))'
            elif len(self.children) == 1:
                child_key, child = next(self.children.iteritems())
                child_re = child._build_re(child_key)
                if child_key != '*':
                    child_re = '\\.' + child_re
                if self.present:
                    return key + '(' + child_re + ')?'
                else:
                    return key + child_re
            elif self.children:
                children_re = '(?:' + '|'.join(
                    ('\\.' if k != '*' else '') + self.children[k]._build_re(k)
                    for k in sorted(self.children.keys())) + ')'
                if self.present:
                    return key + children_re + '?'
                else:
                    return key + children_re
            else:
                assert self.present
                return key

        def build_re(self, key):
            return re.compile('\\b' + self._build_re(key) + '\\b')

    root = Node()
    for provide in all_provides:
        node = root
        for component in provide.split('.'):
            if component not in node.children:
                node.children[component] = Node()
            node = node.children[component]
        if component[0].islower():
            # We've arrived at a namespace provide like `ol.foo`.
            # In this case, we want to match uses like `ol.foo.doIt()` but
            # not match things like `new ol.foo.SomeClass()`.
            # For this purpose, we use the special wildcard key for the child.
            node.children['*'] = Node()
        else:
            node.present = True
    provide_res = [child.build_re(key)
                   for key, child in root.children.iteritems()]
    missing_count = 0
    for filename in sorted(t.dependencies):
        if filename in INTERNAL_SRC or filename in EXTERNAL_SRC:
            continue
        provides = set()
        requires = set()
        uses = set()
        uses_linenos = {}
        for lineno, line in _strip_comments(open(filename, 'rU')):
            m = re.match(r'goog.provide\(\'(.*)\'\);', line)
            if m:
                provides.add(m.group(1))
                continue
            m = re.match(r'goog.require\(\'(.*)\'\);', line)
            if m:
                requires.add(m.group(1))
                continue
            while True:
                for provide_re in provide_res:
                    m = provide_re.search(line)
                    if m:
                        uses.add(m.group())
                        uses_linenos[m.group()] = lineno
                        line = line[:m.start()] + line[m.end():]
                        break
                else:
                    break
        if filename == 'src/ol/renderer/layerrenderer.js':
            uses.discard('ol.renderer.Map')
        m = re.match(r'src/ol/renderer/(\w+)/\1(\w*)layerrenderer\.js\Z', filename)
        if m:
            uses.discard('ol.renderer.Map')
            uses.discard('ol.renderer.%s.Map' % (m.group(1),))
        missing_requires = uses - requires - provides
        if missing_requires:
            for missing_require in sorted(missing_requires):
                t.info("%s:%d missing goog.require('%s')" %
                       (filename, uses_linenos[missing_require], missing_require))
                missing_count += 1
    if unused_count or missing_count:
        t.error('%d unused goog.requires, %d missing goog.requires' %
                (unused_count, missing_count))
    t.touch()
def proj4js(t):
    from zipfile import ZipFile
    zf = ZipFile(PROJ4JS_ZIP)
    contents = zf.open('proj4js/lib/proj4js-combined.js').read()
    with open(t.name, 'wb') as f:
        f.write(contents)
def fat_aar(distdir, aars_paths, no_process=False, no_compatibility_check=False):
    if no_process:
        print("Not processing architecture-specific artifact Maven AARs.")
        return 0

    # Map {filename: {fingerprint: [arch1, arch2, ...]}}.
    diffs = defaultdict(lambda: defaultdict(list))
    missing_arch_prefs = set()
    # Collect multi-architecture inputs to the fat AAR.
    copier = FileCopier()

    for arch, aar_path in aars_paths.items():
        # Map old non-architecture-specific path to new architecture-specific path.
        old_rewrite_map = {
            "greprefs.js": "{}/greprefs.js".format(arch),
            "defaults/pref/geckoview-prefs.js":
                "defaults/pref/{}/geckoview-prefs.js".format(arch),
        }

        # Architecture-specific preferences files.
        arch_prefs = set(old_rewrite_map.values())
        missing_arch_prefs |= set(arch_prefs)

        jar_finder = JarFinder(aar_path, JarReader(aar_path))
        for path, fileobj in UnpackFinder(jar_finder):
            # Native libraries go straight through.
            if mozpath.match(path, "jni/**"):
                copier.add(path, fileobj)
            elif path in arch_prefs:
                copier.add(path, fileobj)
            elif path in ("classes.jar", "annotations.zip"):
                # annotations.zip differs due to timestamps, but the contents
                # should not.  `JarReader` fails on the non-standard
                # `classes.jar` produced by Gradle/aapt, and it's not worth
                # working around, so we use Python's zip functionality instead.
                z = ZipFile(BytesIO(fileobj.open().read()))
                for r in z.namelist():
                    fingerprint = sha1(z.open(r).read()).hexdigest()
                    diffs["{}!/{}".format(path, r)][fingerprint].append(arch)
            else:
                fingerprint = sha1(six.ensure_binary(fileobj.open().read())).hexdigest()
                # There's no need to distinguish `target.maven.zip` from
                # `assets/omni.ja` here, since in practice they will never overlap.
                diffs[path][fingerprint].append(arch)
                missing_arch_prefs.discard(path)

    # Some differences are allowed across the architecture-specific AARs.  We
    # could allow-list the actual content, but it's not necessary right now.
    allow_pattern_list = {
        "AndroidManifest.xml",  # Min SDK version is different for 32- and 64-bit builds.
        "classes.jar!/org/mozilla/gecko/util/HardwareUtils.class",  # Min SDK as well.
        "classes.jar!/org/mozilla/geckoview/BuildConfig.class",  # Each input captures its CPU architecture.
        "chrome/toolkit/content/global/buildconfig.html",
        # Bug 1556162: localized resources are not deterministic across
        # per-architecture builds triggered from the same push.
        "**/*.ftl",
        "**/*.dtd",
        "**/*.properties",
    }

    not_allowed = OrderedDict()

    def format_diffs(ds):
        # Like '  armeabi-v7a, arm64-v8a -> XXX\n  x86, x86_64 -> YYY'.
        return "\n".join(sorted(
            "  {archs} -> {fingerprint}".format(
                archs=", ".join(sorted(archs)), fingerprint=fingerprint)
            for fingerprint, archs in ds.items()))

    for p, ds in sorted(diffs.items()):
        if len(ds) <= 1:
            # Only one hash across all inputs: roll on.
            continue
        if any(mozpath.match(p, pat) for pat in allow_pattern_list):
            print('Allowed: Path "{path}" has architecture-specific versions:\n{ds_repr}'
                  .format(path=p, ds_repr=format_diffs(ds)))
            continue
        not_allowed[p] = ds

    for p, ds in not_allowed.items():
        print('Disallowed: Path "{path}" has architecture-specific versions:\n{ds_repr}'
              .format(path=p, ds_repr=format_diffs(ds)))

    for missing in sorted(missing_arch_prefs):
        print("Disallowed: Inputs missing expected architecture-specific input: {missing}"
              .format(missing=missing))

    if not no_compatibility_check and (missing_arch_prefs or not_allowed):
        return 1

    output_dir = mozpath.join(distdir, "output")
    copier.copy(output_dir)

    return 0
def zip_extract(zip_file: zipfile.ZipFile, file_name: str, target_file_obj: IO):
    with zip_file.open(file_name) as fp:
        shutil.copyfileobj(fp, target_file_obj)
        target_file_obj.flush()
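# A minimal usage sketch for zip_extract() above, assuming a local
# "archive.zip" containing a member "data.txt" (both names are hypothetical):
import tempfile
import zipfile

with zipfile.ZipFile("archive.zip") as zf, tempfile.NamedTemporaryFile() as tmp:
    zip_extract(zf, "data.txt", tmp)
    tmp.seek(0)
    print(tmp.read()[:80])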
def open(self, *args, **kwargs):
    base = BaseZipFile.open(self, *args, **kwargs)
    return ZipExtFile(base)
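# For context, a sketch of how the open() override above could be wired up.
# BaseZipFile and ZipExtFile are taken from the snippet itself; aliasing
# BaseZipFile to zipfile.ZipFile and the pass-through wrapper body are
# assumptions, not the original codebase's definitions:
from zipfile import ZipFile as BaseZipFile


class ZipExtFile(object):
    """Hypothetical wrapper delegating to the underlying member handle."""

    def __init__(self, base):
        self._base = base

    def __getattr__(self, name):
        return getattr(self._base, name)


class WrappingZipFile(BaseZipFile):
    def open(self, *args, **kwargs):
        base = BaseZipFile.open(self, *args, **kwargs)
        return ZipExtFile(base)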
os.chdir('/home/llq205/clean_feature1')
zipfiles = glob('*zip')

for zfname in zipfiles:
    print(zfname)
    zfile = ZipFile(zfname)
    year = zfname.split('/')[-1][:-4]
    members = zfile.namelist()
    year_count = Counter()
    for fname in members:
        # print(fname)
        if fname.endswith('-maj.p'):
            each = pickle.load(zfile.open(fname, 'r'))
            if each != [] and each[0] != Counter():
                d = Counter({k: 1 for k, v in each[0].items()})
                year_count += d
        elif not fname.endswith('.p'):
            continue
        elif fname.endswith('dis/.p'):
            continue
        else:
            optype = fname.split('-')[-1][:-2]
            docid = fname.split('/')[-1][:-2]
            if len(optype) == 7:
                each = pickle.load(zfile.open(fname, 'r'))
                if each != [] and each[0] != Counter():
def run(args, tmp_dir):
    tmp_dir = Path(tmp_dir)
    base_dir = Path(args.dest)
    if not base_dir.exists():
        base_dir.mkdir(parents=True)
    base_dir = base_dir.resolve()

    if Path(args.pack_path).exists():
        pack_path = args.pack_path
        logger.info('Found local modpack {}'.format(pack_path))
    else:
        pack_path = tmp_dir / 'modpack.zip'
        logger.info('Downloading the modpack to {} ...'.format(pack_path))
        pack_path, _ = urlretrieve(args.pack_path, str(pack_path))
        logger.info('  Done')

    modpack = ZipFile(pack_path)
    manifest = json.loads(modpack.open('manifest.json').read().decode('utf-8'))
    logger.info('Modpack: {name} (Version {version})'.format(**manifest))

    if args.exclude:
        mod_blacklist = [line.rstrip() for line in args.exclude]
        logger.debug('Mod blacklist: {}'.format(mod_blacklist))
        args.exclude.close()
    else:
        mod_blacklist = None

    # Download the mod files
    mod_store = tmp_dir / 'mod_store'
    mod_store.mkdir()
    logger.info('Starting mod downloads, this may take a while')
    with ThreadPoolExecutor(args.threads) as executor:
        futures = []
        for mod in manifest['files']:
            futures.append(executor.submit(download_mod, mod, mod_store,
                                           blacklist=mod_blacklist))
        bonus = manifest.get('directDownload', [])
        for entry in bonus:
            url, filename = (entry.get(x) for x in ('url', 'filename'))
            if url is None or filename is None:
                # logging calls take no `file=` argument; log the bad entry and move on
                logger.warning('Error while handling entry {}'.format(entry))
                continue
            futures.append(executor.submit(download, url, mod_store, filename))
        # Re-raise the exceptions which might have happened
        for f in as_completed(futures):
            e = f.exception()
            if e:
                for g in futures:
                    g.cancel()
                if isinstance(e, urllib.error.HTTPError):
                    logger.error('Error while fetching {}'.format(e.url))
                f.result()
    logger.info('  Done')

    # Backup some config
    subdirs = {d: base_dir / d for d in ('mods', 'config')}
    backups = {k: d.with_suffix('.bak') for k, d in subdirs.items()}
    for k, d in subdirs.items():
        if d.exists():
            b = backups[k]
            if b.exists():
                shutil.rmtree(str(b))
            d.replace(b)
        d.mkdir()

    # Update Forge
    if not args.keep_forge:
        mc_spec = manifest.get('minecraft', {})
        mc_version = mc_spec.get('version')
        forge_ids = [x['id'].replace('forge-', '')
                     for x in mc_spec.get('modLoaders', [])
                     if x.get('id', '').startswith('forge-')]
        if mc_version and forge_ids:
            update_forge(mc_version, forge_ids[0], tmp_dir, base_dir)
        else:
            logger.warning('Could not extract Forge information from the manifest')
            logger.debug('minecraft : {}\nmodLoaders : {}'.format(
                manifest.get('minecraft'), manifest.get('modLoaders')))

    # Install mod files
    logger.info('Installing mods...')
    copytree(str(mod_store), str(subdirs['mods']))

    # Apply overrides
    logger.info('Applying custom config...')
    overrides = manifest.get('overrides')
    if overrides is not None:
        overrides = Path(overrides)
        # Extract only the members that live below the overrides directory.
        todo = [entry for entry in modpack.namelist()
                if overrides in Path(entry).parents]
        modpack.extractall(str(tmp_dir), todo)
        copytree(str(tmp_dir / overrides), str(base_dir))
    if args.keep_config and backups['config'].exists():
        copytree(str(backups['config']), str(subdirs['config']))
    logger.info('Modpack {name} successfully installed'.format(**manifest))
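# Related defensive sketch, not part of the installer above: when extracting
# a computed subset of members, it can be worth rejecting entries that would
# escape the destination directory ("zip slip"). Names here are illustrative.
import os
from zipfile import ZipFile


def safe_extract(zf, members, dest):
    dest = os.path.abspath(dest)
    for name in members:
        target = os.path.abspath(os.path.join(dest, name))
        if not target.startswith(dest + os.sep):
            raise ValueError('unsafe member path: {}'.format(name))
    zf.extractall(dest, members)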
import pandas as pd
import plotly.graph_objects as go  # plotly 4.14.1
from plotly.subplots import make_subplots
import dash  # version 1.18.0 (pip install dash)
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc  # version 0.11.1
import requests
from io import BytesIO
from zipfile import ZipFile

#----------------------------------------------
# Importing Google Mobility data
#----------------------------------------------
url = "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip"
content = requests.get(url).content
zf = ZipFile(BytesIO(content), 'r')
df = pd.read_csv(zf.open('2020_DE_Region_Mobility_Report.csv'),
                 usecols=[
                     'sub_region_1',
                     'date',
                     'retail_and_recreation_percent_change_from_baseline',
                     'grocery_and_pharmacy_percent_change_from_baseline',
                     'parks_percent_change_from_baseline',
                     'transit_stations_percent_change_from_baseline',
                     'workplaces_percent_change_from_baseline',
                     'residential_percent_change_from_baseline'
                 ])

# Modifying the name of the columns
df.rename(columns={
    "sub_region_1": "state",
    "retail_and_recreation_percent_change_from_baseline": "retil_creat",
    "grocery_and_pharmacy_percent_change_from_baseline": "groce_pharma",
def form_valid(self, form):
    # This method is called when valid form data has been POSTed.
    # It should return an HttpResponse.
    # TODO parse the kml file more smartly to locate the first placemark and work from there.
    kml_file_upload = self.request.FILES['kmlfileUpload']  # get a handle on the file
    kml_file_upload_name = self.request.FILES['kmlfileUpload'].name  # get the file name
    # kml_file_name = kml_file_upload_name[:kml_file_upload_name.rfind('.')]  # get the file name no extension
    kml_file_extension = kml_file_upload_name[kml_file_upload_name.rfind('.') + 1:]  # get the file extension
    kml_file_path = os.path.join(settings.MEDIA_ROOT)

    # Define a routine for importing Placemarks from a list of placemark elements
    def import_placemarks(kml_placemark_list):
        """
        A procedure that reads a KML placemark list and saves the data into the django database
        :param kml_placemark_list:
        :return:
        """
        occurrence_count, archaeology_count, biology_count, geology_count = [0, 0, 0, 0]
        Occurrence.objects.all().update(last_import=False)  # Toggle off all last imports
        for o in kml_placemark_list:
            # Check to make sure that the object is a Placemark, filter out folder objects
            if type(o) is Placemark:
                # Step 1 - parse the xml and copy placemark attributes to a dictionary
                table = etree.fromstring(o.description)  # get the table element with all the data from the xml.
                attributes = table.xpath("//text()|//img")  # get all text values and image tags from xml string
                # TODO test that attributes has an even length
                # Create a dictionary from the attribute list. The list has key-value pairs as alternating
                # elements; the line below takes the first and every other element and adds them
                # as keys, then the second and every other element and adds them as values.
                # e.g.
                # attributes[0::2] = ["Basis of Record", "Time", "Item Type" ...]
                # attributes[1::2] = ["Collection", "May 27, 2017, 10:12 AM", "Faunal" ...]
                # zip creates a list of tuples = [("Basis of Record", "Collection"), ...]
                # which is converted to a dictionary.
                attributes_dict = dict(zip(attributes[0::2], attributes[1::2]))

                # Step 2 - Create a new Occurrence object (or subtype)
                lgrp_occ = None
                # Determine the appropriate subtype and initialize
                item_type = attributes_dict.get("Item Type")
                occurrence_count += 1
                if item_type in ("Artifact", "Artifactual", "Archeology", "Archaeological"):
                    lgrp_occ = Archaeology()
                    archaeology_count += 1
                elif item_type in ("Faunal", "Fauna", "Floral", "Flora"):
                    lgrp_occ = Biology()
                    biology_count += 1
                elif item_type in ("Geological", "Geology"):
                    lgrp_occ = Geology()
                    geology_count += 1

                # Step 3 - Copy attributes from dictionary to Occurrence object, validate as we go.
                # Improve by checking each field to see if it has a choice list. If so validate against
                # the choice list.

                # Verbatim Data - save a verbatim copy of the original kml placemark attributes.
lgrp_occ.verbatim_kml_data = attributes # Validate Basis of Record if attributes_dict.get("Basis Of Record") in ( "Fossil", "FossilSpecimen", "Collection"): lgrp_occ.basis_of_record = "Collection" elif attributes_dict.get("Basis Of Record") in ( "Observation", "HumanObservation"): lgrp_occ.basis_of_record = "Observation" # Validate Item Type item_type = attributes_dict.get("Item Type") if item_type in ("Artifact", "Artifactual", "Archeology", "Archaeological"): lgrp_occ.item_type = "Artifactual" elif item_type in ("Faunal", "Fauna"): lgrp_occ.item_type = "Faunal" elif item_type in ("Floral", "Flora"): lgrp_occ.item_type = "Floral" elif item_type in ("Geological", "Geology"): lgrp_occ.item_type = "Geological" # Date Recorded try: # parse the time lgrp_occ.date_recorded = parse( attributes_dict.get("Time")) # set the year collected form field number lgrp_occ.year_collected = lgrp_occ.date_recorded.year except ValueError: # If there's a problem getting the fieldnumber, use the current date time and set the # problem flag to True. lgrp_occ.date_recorded = datetime.now() lgrp_occ.problem = True try: error_string = "Upload error, missing field number, using current date and time instead." lgrp_occ.problem_comment = lgrp_occ.problem_comment + " " + error_string except TypeError: lgrp_occ.problem_comment = error_string # Process point, comes in as well known text string # Assuming point is in GCS WGS84 datum = SRID 4326 pnt = GEOSGeometry("POINT (" + str(o.geometry.x) + " " + str(o.geometry.y) + ")", 4326) # WKT lgrp_occ.geom = pnt scientific_name_string = attributes_dict.get( "Scientific Name") lgrp_occ.item_scientific_name = scientific_name_string if lgrp_occ.item_scientific_name: match, match_count, match_list = match_taxon(lgrp_occ) if match and match_count == 1: lgrp_occ.taxon = match_list[0] lgrp_occ.item_description = attributes_dict.get( "Description") if lgrp_occ.item_description: match, match_count, match_list = match_element( lgrp_occ) if match and match_count == 1: lgrp_occ.element = lgrp_occ.item_description.lower( ) ####################### # NON-REQUIRED FIELDS # ####################### lgrp_occ.barcode = attributes_dict.get("Barcode") lgrp_occ.item_number = lgrp_occ.barcode lgrp_occ.collection_remarks = attributes_dict.get( "Collecting Remarks") lgrp_occ.geology_remarks = attributes_dict.get( "Geology Remarks") lgrp_occ.collecting_method = attributes_dict.get( "Collection Method") finder_string = attributes_dict.get("Finder") lgrp_occ.finder = finder_string # import person object, validated against look up data in Person table lgrp_occ.finder_person, created = Person.objects.get_or_create( name=finder_string) collector_string = attributes_dict.get("Collector") lgrp_occ.collector = collector_string # import person object, validated against look up data in Person table lgrp_occ.collector_person, created = Person.objects.get_or_create( name=collector_string) lgrp_occ.individual_count = attributes_dict.get("Count") if attributes_dict.get("In Situ") in ('No', "NO", 'no'): lgrp_occ.in_situ = False elif attributes_dict.get("In Situ") in ('Yes', "YES", 'yes'): lgrp_occ.in_situ = True if attributes_dict.get("Ranked Unit") in ('No', "NO", 'no'): lgrp_occ.ranked = False elif attributes_dict.get("Ranked Unit") in ('Yes', "YES", 'yes'): lgrp_occ.ranked = True unit_found_string = attributes_dict.get("Unit Found") unit_likely_string = attributes_dict.get("Unit Likely") lgrp_occ.analytical_unit_found = unit_found_string lgrp_occ.analytical_unit_likely = unit_likely_string lgrp_occ.analytical_unit_1 
= attributes_dict.get("Unit 1") lgrp_occ.analytical_unit_2 = attributes_dict.get("Unit 2") lgrp_occ.analytical_unit_3 = attributes_dict.get("Unit 3") # import statigraphy object, validate against look up data in Stratigraphy table lgrp_occ.unit_found, created = StratigraphicUnit.objects.get_or_create( name=unit_found_string) lgrp_occ.unit_likly, created = StratigraphicUnit.objects.get_or_create( name=unit_likely_string) # Save Occurrence before saving media. Need id to rename media files lgrp_occ.last_import = True lgrp_occ.save() # Save image if kml_file_extension.lower() == "kmz": # grab image names from XML image_names = table.xpath("//img/@src") # grab the name of the first image # Future: add functionality to import multiple images if image_names and len( image_names ) == 1: # This will break if image_names is None image_name = image_names[0] # Check that the image name is in the kmz file list kmz_file.filenames = [ f.orig_filename for f in kmz_file.filelist ] if image_name in kmz_file.filenames: # etch the kmz image file object, this is a ZipInfo object not a File object image_file_obj = next( f for f in kmz_file.filelist if f.orig_filename == image_name) # fetch the upload directory from the model definition upload_dir = Biology._meta.get_field( 'image').upload_to # update image name to include upload path and occurrence id # e.g. /uploads/images/lgrp/14775_188.jpg new_image_name = os.path.join( upload_dir, str(lgrp_occ.id) + '_' + image_name) # Save the image lgrp_occ.image.save( new_image_name, ContentFile(kmz_file.read(image_file_obj))) elif type(o) is not Placemark: raise IOError("KML File is badly formatted") if occurrence_count == 1: message_string = '1 occurrence' if occurrence_count > 1: message_string = '{} occurrences'.format(occurrence_count) messages.add_message( self.request, messages.INFO, 'Successfully imported {} occurrences'.format(message_string)) kml_file = kml.KML() if kml_file_extension == "kmz": kmz_file = ZipFile(kml_file_upload, 'r') kml_document = kmz_file.open('doc.kml', 'r').read() else: # read() loads entire file as one string kml_document = open(kml_file_path + "/" + kml_file_upload_name, 'r').read() kml_file.from_string( kml_document ) # pass contents of kml string to kml document instance for parsing # get the top level features object (this is essentially the layers list) level1_elements = list(kml_file.features()) # Check that the kml file is well-formed with a single document element. if len(level1_elements) == 1 and type(level1_elements[0]) == Document: document = level1_elements[0] # If well-formed document, check if the file has folders, which correspond to layers level2_elements = list(document.features()) if len(level2_elements) == 1 and type( level2_elements[0]) == Folder: folder = level2_elements[0] # If a single folder is present import placemarks from that folder # Get features from the folder level3_elements = list(folder.features()) # Check that the features are Placemarks. If they are, import them if len(level3_elements) >= 1 and type( level3_elements[0]) == Placemark: placemark_list = level3_elements import_placemarks(placemark_list) elif len(level2_elements) >= 1 and type( level2_elements[0]) == Placemark: placemark_list = level2_elements import_placemarks(placemark_list) return super(ImportKMZ, self).form_valid(form)
def output(self): ''' Generate SRTM data wrapper @return SRTM Image Wrapper ''' lat_tile_array = np.arange(self.lat_tile_start, self.lat_tile_end + 1) lon_tile_array = np.arange(self.lon_tile_start, self.lon_tile_end + 1) lat_grid, lon_grid = np.meshgrid(lat_tile_array, lon_tile_array) lat_grid = lat_grid.ravel() lon_grid = lon_grid.ravel() filename_root = '.SRTMGL1.' base_url = 'https://e4ftl01.cr.usgs.gov/MEASURES/' folder_root = 'SRTMGL1.003/2000.02.11/' if self.arcsecond_sampling == 3: filename_root = '.SRTMGL3.' folder_root = 'SRTMGL3.003/2000.02.11/' base_url += folder_root filename_list = [] for lat, lon in zip(lat_grid, lon_grid): if lat < 0: lat_label = 'S' lat = np.abs(lat) else: lat_label = 'N' if lon < 0: lon_label = 'W' lon = np.abs(lon) else: lon_label = 'E' filename_list.append(lat_label + convertToStr(lat, 2) + lon_label + convertToStr(lon, 3) + filename_root + 'hgt.zip') if self.mask_water == True: filename_list.append(lat_label + convertToStr(lat, 2) + lon_label + convertToStr(lon, 3) + filename_root + 'num.zip') # Read in list of available data srtm_list_filename = 'srtm_gl1.txt' if self.arcsecond_sampling == 3: srtm_list_filename = 'srtm_gl3.txt' srtm_support_filename = resource_filename( 'skdaccess', os.path.join('support', srtm_list_filename)) available_file_list = open(srtm_support_filename).readlines() available_file_list = [ filename.strip() for filename in available_file_list ] requested_files = pd.DataFrame({'Filename': filename_list}) requested_files['Valid'] = [ '.'.join(filename.split('.')[0:-2]) in available_file_list for filename in filename_list ] valid_filename_list = requested_files.loc[requested_files['Valid'] == True, 'Filename'].tolist() url_list = [base_url + filename for filename in valid_filename_list] downloaded_file_list = self.cacheData( 'srtm', url_list, self.username, self.password, 'https://urs.earthdata.nasa.gov') requested_files.loc[requested_files['Valid'] == True, 'Full Path'] = downloaded_file_list def getCoordinates(filename): ''' Determine the longitude and latitude of the lowerleft corner of the input filename @param in_filename: Input SRTM filename @return Latitude of southwest corner, Longitude of southwest corner ''' lat_start = int(filename[1:3]) if filename[0] == 'S': lat_start *= -1 lon_start = int(filename[4:7]) if filename[3] == 'W': lon_start *= -1 return lat_start, lon_start data_dict = OrderedDict() metadata_dict = OrderedDict() array_shape = (3601, 3601) if self.arcsecond_sampling == 3: array_shape = (1201, 1201) file_slice = slice(None) water_value = 0 if self.mask_water == True: file_slice = slice(0, -1, 2) water_value = np.nan for i in requested_files.index[file_slice]: hgt_full_path = requested_files.at[i, 'Full Path'] hgt_filename = requested_files.at[i, 'Filename'] label = hgt_filename[:7] lat_start, lon_start = getCoordinates(hgt_filename) metadata_dict[label] = OrderedDict() x_res = 1.0 / (array_shape[0] - 1) y_res = 1.0 / (array_shape[1] - 1) extents = [ lon_start - x_res / 2, lon_start + 1 + x_res / 2, lat_start - y_res / 2, lat_start + 1 + y_res / 2 ] if requested_files.at[i, 'Valid']: masked_dem_data = np.ones(array_shape) if self.mask_water == True and requested_files.at[i + 1, 'Valid']: num_full_path = requested_files.at[i + 1, 'Full Path'] num_filename = requested_files.at[i + 1, 'Full Path'] zipped_num_data = ZipFile(num_full_path) zipped_num_full_path = zipped_num_data.infolist( )[0].filename num_data = np.frombuffer( zipped_num_data.open(zipped_num_full_path).read(), np.dtype('uint8')).reshape(array_shape) 
masked_dem_data[(num_data == 1) | (num_data == 2)] = water_value i += 1 zipped_hgt_data = ZipFile(hgt_full_path) dem_dataset = gdal.Open(hgt_full_path, gdal.GA_ReadOnly) dem_data = dem_dataset.ReadAsArray() masked_dem_data *= dem_data metadata_dict[label]['WKT'] = dem_dataset.GetProjection() metadata_dict[label][ 'GeoTransform'] = dem_dataset.GetGeoTransform() else: geo_transform = [] geo_transform.append(extents[0]) geo_transform.append(x_res) geo_transform.append(0) geo_transform.append(extents[-1]) geo_transform.append(0) geo_transform.append(-y_res) metadata_dict[label]['WKT'] = self._missing_data_projection metadata_dict[label]['GeoTransform'] = geo_transform masked_dem_data = np.full(shape=array_shape, fill_value=water_value) i += 1 data_dict[label] = masked_dem_data metadata_dict[label]['Geolocation'] = AffineGlobalCoords( metadata_dict[label]['GeoTransform'], center_pixels=True) metadata_dict[label]['extents'] = extents if self.store_geolocation_grids: lat_coords, lon_coords = np.meshgrid( np.linspace(lat_start + 1, lat_start, array_shape[0]), np.linspace(lon_start, lon_start + 1, array_shape[1]), indexing='ij') metadata_dict[label]['Latitude'] = lat_coords metadata_dict[label]['Longitude'] = lon_coords return ImageWrapper(obj_wrap=data_dict, meta_data=metadata_dict)
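# Minimal sketch of the zipped-raster read used above. SRTM .hgt tiles are
# big-endian signed 16-bit integers, so a member can be decoded straight from
# the archive with numpy; the zip path and the default tile shape here are
# assumptions.
import numpy as np
from zipfile import ZipFile


def read_hgt_from_zip(zip_path, shape=(3601, 3601)):
    with ZipFile(zip_path) as z:
        member = z.infolist()[0].filename  # a .hgt archive holds one tile
        raw = z.open(member).read()
    return np.frombuffer(raw, dtype='>i2').reshape(shape)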
# dxf file input
#---------------
if args.dxf is not None:
    print("DIPPY Tool: DXF input file: " + args.dxf)
    dxfile = []
    if args.dxf.endswith(".zip"):
        myzip = ZipFile(args.dxf, 'r')
        for sl in myzip.namelist():
            # skip the "negative" variants bundled in the archive
            if "negative" not in sl:
                dxfile.append(myzip.open(sl, 'r'))
    else:
        dxfile.append(open(args.dxf, 'r'))
    if len(dxfile) > 0:
        for s in dxfile:
            dxf15parse = ET.XMLParser(remove_blank_text=True)
            doc = ET.parse(s, dxf15parse)
            if opcode == 'addexpt':
                if args.acf in ['DEFAULT', 'True']:
                    print("Auto-create mode")
                    dpa = DP.ExptSubmitter()
                    dpa.addExpt(doc)
                else:
def import_tree(id): def import_region(feature): def extract_data(properties): result = {'level': properties['admin_level']} fields = ['boundary', 'ISO3166-1:alpha3', 'timezone'] for field in fields: result[field] = properties['tags'].get(field, None) return result print(feature['properties']['name']) parent = None if len(feature['rpath']) > 2: # sometimes they are swapped parent_id = feature['rpath'][1] if int(feature['rpath'][0]) == feature['id'] else \ feature['rpath'][0] parent = Region.objects.get(osm_id=parent_id) region = Region.objects.create( title=feature['properties']['name'], polygon=GEOSGeometry(json.dumps(feature['geometry'])), parent=parent, wikidata_id=feature['properties']['tags'].get('wikidata'), osm_id=feature['id'], osm_data=extract_data(feature['properties']) ) for lang in ('en', 'ru'): trans = region.load_translation(lang) trans.master = region trans.name = region.title trans.save() zip_file = os.path.join(settings.GEOJSON_DIR, '{}.zip'.format(id)) if not os.path.exists(zip_file): url = settings.OSM_URL.format(id=id, key=settings.OSM_KEY) print(url) response = requests.get(url, stream=True) if response.status_code != 200: raise Exception('Bad request') with open(zip_file, 'wb') as out_file: response.raw.decode_content = True shutil.copyfileobj(response.raw, out_file) zipfile = ZipFile(zip_file) zip_names = zipfile.namelist() for zip_name in zip_names: print(zip_name) # if zip_name.endswith('AL2.GeoJson') or zip_name.endswith('AL3.GeoJson') or zip_name.endswith('AL4.GeoJson'): # if not zip_name.endswith('AL6.GeoJson'): # continue level = json.loads(zipfile.open(zip_name).read().decode()) not_passed = [] for feature in level['features']: try: if not Region.objects.filter(osm_id=feature['id']).exists(): import_region(feature) except Region.DoesNotExist: not_passed.append(feature) continue while len(not_passed) > 0: bad_passed = [] for feature in not_passed: try: import_region(feature) except Region.DoesNotExist: bad_passed.append(feature) continue if not_passed == bad_passed: print('Circular references') break not_passed = bad_passed
# This dataset comes from the [UCI Machine Learning Data Repository](https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data). It includes data on air pollutants and weather from 12 sites. To simplify the example, we'll focus on weekly averages for two measures: PM10 and SO2. Since these measures are strictly positive, we log-transform them. # + {"tags": ["remove_cell"]} # Read in data try: df_aq = pd.read_csv("./PRSA2017_Data_20130301-20170228.csv") except FileNotFoundError: import requests from zipfile import ZipFile from io import BytesIO response =\ requests.get('http://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip') zip_file = ZipFile(BytesIO(response.content)) files = zip_file.namelist() df_aq = pd.concat( [pd.read_csv(zip_file.open(f)) for f in files if f.endswith('csv')]) df_aq.to_csv("./PRSA2017_Data_20130301-20170228.csv", index=False) df_aq['time'] = pd.to_datetime(df_aq.loc[:, ['year', 'month', 'day', 'hour']]) df_aq = df_aq.rename(columns={'PM2.5': 'PM2p5'}) df_aq_weekly = df_aq.\ assign(date= lambda df: df['time'].astype('datetime64[D]') - pd.to_timedelta(df['time'].dt.dayofweek, unit='d')).\ drop(columns= ['year','month','day','hour']).\ groupby(['date','station']).\ agg('mean').\ reset_index().\ sort_values(['station','date']).\ reset_index() # for training/validation split
def read_single_sheet(path, name=None):
    """ Read an xlsx, csv or tsv from a zipfile or directory
    """
    from zipfile import ZipFile
    from . import xlreader

    if name is None:
        root, ext = os.path.splitext(path)
        # xlsx is a binary format; everything else is read as text
        stream = open(path, 'rb' if ext == '.xlsx' else 'r')
        if ext == '.xlsx':
            return read_xl(stream)
        if ext == '.tsv':
            return read_csv(stream, dialect='excel-tab')
        if ext == '.csv':
            return read_csv(stream)
        if ext == '.json':
            return read_json(stream)
        raise ValueError('Unknown file extension for %r' % path)

    if path.endswith('.xlsx'):
        return cast_row_values(
            xlreader.DictReader(open(path, 'rb'), sheetname=name))

    if path.endswith('.zip'):
        zf = ZipFile(path)
        names = zf.namelist()
        # ZipFile.open only accepts mode 'r' (the old 'rU' variant was removed
        # in modern Python)
        if (name + '.xlsx') in names:
            stream = zf.open(name + '.xlsx', 'r')
            return read_xl(stream)
        if (name + '.tsv') in names:
            stream = zf.open(name + '.tsv', 'r')
            return read_csv(stream, dialect='excel-tab')
        if (name + '.csv') in names:
            stream = zf.open(name + '.csv', 'r')
            return read_csv(stream)
        if (name + '.json') in names:
            stream = zf.open(name + '.json', 'r')
            return read_json(stream)

    if os.path.isdir(path):
        root = os.path.join(path, name)
        if os.path.exists(root + '.xlsx'):
            stream = open(root + '.xlsx', 'rb')
            return read_xl(stream)
        if os.path.exists(root + '.tsv'):
            stream = open(root + '.tsv', 'r')
            return read_csv(stream, dialect='excel-tab')
        if os.path.exists(root + '.csv'):
            stream = open(root + '.csv', 'r')
            return read_csv(stream)
        if os.path.exists(root + '.json'):
            stream = open(root + '.json', 'r')
            return read_json(stream)

    return []
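# Usage sketch for read_single_sheet above: the same sheet name resolves
# against a zip archive or a directory. The path and sheet name are
# hypothetical.
rows = read_single_sheet('/tmp/submission.zip', name='experiments')
for row in rows:
    print(row)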
""" Convert the scraped data from Hoboken into the same format as the Yelp Open Dataset data BIA660D - Group 1: Alec Kulakowski """ # Navigate into the correct project directory import os # os.listdir() os.chdir('../BIA660D_Group_1_Project') # Extract the review data from its .zip form from zipfile import ZipFile import pandas as pd zf = ZipFile("Hoboken_restaurants_reviews.csv.zip") raw = pd.read_csv(zf.open('Hoboken_restaurants_reviews.csv')) validation = raw.copy() validation = validation.drop(columns=validation.columns.values[0:2]) # Drop index columns # Convert user_ratings from string to integer and restaurant ratings from string to float validation['user_rating'] = validation['user_rating'].apply(lambda x: int(x[0])) validation['restaurant_rating'] = validation['restaurant_rating'].apply(lambda x: float(x[0:3])) # Display price distribution print(validation['restaurant_price'].value_counts()) # Display number of absent prices print('Missing prices: '+str(validation['restaurant_price'].isnull().sum())) # Replace missing values with mean and convert to integer def try_convert(x, y=0): try: return(x.count('$')) except: return(y) average_price = sum(validation['restaurant_price'].apply(lambda x: try_convert(x))) / validation['restaurant_price'].value_counts().sum() validation['restaurant_price'] = validation['restaurant_price'].apply(lambda x: try_convert(x, y=average_price)) # Separate Restaurant Type # from sklearn.feature_extraction import DictVectorizer # dv = DictVectorizer(sparse=False) # dv.fit_transform()
def find_ole(filename, data): """ try to open somehow as zip/ole/rtf/... ; yield None if fail If data is given, filename is (mostly) ignored. yields embedded ole streams in form of OleFileIO. """ if data is not None: # isOleFile and is_ppt can work on data directly but zip need file # --> wrap data in a file-like object without copying data log.debug('working on data, file is not touched below') arg_for_ole = data arg_for_zip = FakeFile(data) else: # we only have a file name log.debug('working on file by name') arg_for_ole = filename arg_for_zip = filename ole = None try: if olefile.isOleFile(arg_for_ole): if is_ppt(arg_for_ole): log.info('is ppt file: ' + filename) for ole in find_ole_in_ppt(arg_for_ole): yield ole ole = None # is closed in find_ole_in_ppt # in any case: check for embedded stuff in non-sectored streams log.info('is ole file: ' + filename) ole = olefile.OleFileIO(arg_for_ole) yield ole elif is_zipfile(arg_for_zip): log.info('is zip file: ' + filename) zipper = ZipFile(arg_for_zip, 'r') for subfile in zipper.namelist(): head = b'' try: with zipper.open(subfile) as file_handle: head = file_handle.read(len(olefile.MAGIC)) except RuntimeError: log.error('zip is encrypted: ' + filename) yield None continue if head == olefile.MAGIC: log.info(' unzipping ole: ' + subfile) with ZipSubFile(zipper, subfile) as file_handle: try: ole = olefile.OleFileIO(file_handle) yield ole except IOError: log.warning('Error reading data from {0}/{1} or ' 'interpreting it as OLE object' .format(filename, subfile)) log.debug('', exc_info=True) finally: if ole is not None: ole.close() ole = None else: log.debug('unzip skip: ' + subfile) else: log.warning('open failed: {0} (or its data) is neither zip nor OLE' .format(filename)) yield None except Exception: log.error('Caught exception opening {0}'.format(filename), exc_info=True) yield None finally: if ole is not None: ole.close()
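# Usage sketch for find_ole above: the generator yields OleFileIO objects
# (or None when nothing could be opened) and closes each one itself before
# yielding the next, so finish with a stream before advancing. The file name
# is hypothetical.
for ole in find_ole('suspicious.docx', None):
    if ole is None:
        continue
    print(ole.listdir())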
class MailMerge(object): def __init__(self, file, remove_empty_tables=False): self.zip = ZipFile(file) self.parts = {} self.settings = None self._settings_info = None self.remove_empty_tables = remove_empty_tables try: content_types = etree.parse(self.zip.open('[Content_Types].xml')) for file in content_types.findall('{%(ct)s}Override' % NAMESPACES): type = file.attrib['ContentType' % NAMESPACES] if type in CONTENT_TYPES_PARTS: zi, self.parts[zi] = self.__get_tree_of_file(file) elif type == CONTENT_TYPE_SETTINGS: self._settings_info, self.settings = self.__get_tree_of_file(file) to_delete = [] for part in self.parts.values(): for parent in part.findall('.//{%(w)s}fldSimple/..' % NAMESPACES): for idx, child in enumerate(parent): if child.tag != '{%(w)s}fldSimple' % NAMESPACES: continue instr = child.attrib['{%(w)s}instr' % NAMESPACES] name = self.__parse_instr(instr) if name is None: continue parent[idx] = Element('MergeField', name=name) for parent in part.findall('.//{%(w)s}instrText/../..' % NAMESPACES): children = list(parent) fields = zip( [children.index(e) for e in parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="begin"]/..' % NAMESPACES)], [children.index(e) for e in parent.findall('{%(w)s}r/{%(w)s}fldChar[@{%(w)s}fldCharType="end"]/..' % NAMESPACES)] ) for idx_begin, idx_end in fields: # consolidate all instrText nodes between'begin' and 'end' into a single node begin = children[idx_begin] instr_elements = [e for e in begin.getparent().findall('{%(w)s}r/{%(w)s}instrText' % NAMESPACES) if idx_begin < children.index(e.getparent()) < idx_end] if len(instr_elements) == 0: continue # set the text of the first instrText element to the concatenation # of all the instrText element texts instr_text = ''.join([e.text for e in instr_elements]) instr_elements[0].text = instr_text # delete all instrText elements except the first for instr in instr_elements[1:]: instr.getparent().remove(instr) name = self.__parse_instr(instr_text) if name is None: continue parent[idx_begin] = Element('MergeField', name=name) # use this so we know *where* to put the replacement instr_elements[0].tag = 'MergeText' block = instr_elements[0].getparent() # append the other tags in the w:r block too parent[idx_begin].extend(list(block)) to_delete += [(parent, parent[i + 1]) for i in range(idx_begin, idx_end)] for parent, child in to_delete: parent.remove(child) # Remove mail merge settings to avoid error messages when opening document in Winword if self.settings: settings_root = self.settings.getroot() mail_merge = settings_root.find('{%(w)s}mailMerge' % NAMESPACES) if mail_merge is not None: settings_root.remove(mail_merge) except: self.zip.close() raise @classmethod def __parse_instr(cls, instr): args = shlex.split(instr, posix=False) if args[0] != 'MERGEFIELD': return None name = args[1] if name[0] == '"' and name[-1] == '"': name = name[1:-1] return name def __get_tree_of_file(self, file): fn = file.attrib['PartName' % NAMESPACES].split('/', 1)[1] zi = self.zip.getinfo(fn) return zi, etree.parse(self.zip.open(zi)) def write(self, file): # Replace all remaining merge fields with empty values for field in self.get_merge_fields(): self.merge(**{field: ''}) with ZipFile(file, 'w', ZIP_DEFLATED) as output: for zi in self.zip.filelist: if zi in self.parts: xml = etree.tostring(self.parts[zi].getroot()) output.writestr(zi.filename, xml) elif zi == self._settings_info: xml = etree.tostring(self.settings.getroot()) output.writestr(zi.filename, xml) else: output.writestr(zi.filename, self.zip.read(zi)) def 
get_merge_fields(self, parts=None): if not parts: parts = self.parts.values() fields = set() for part in parts: for mf in part.findall('.//MergeField'): fields.add(mf.attrib['name']) return fields def merge_templates(self, replacements, separator): """ Duplicate template. Creates a copy of the template, does a merge, and separates them by a new paragraph, a new break or a new section break. separator must be : - page_break : Page Break. - column_break : Column Break. ONLY HAVE EFFECT IF DOCUMENT HAVE COLUMNS - textWrapping_break : Line Break. - continuous_section : Continuous section break. Begins the section on the next paragraph. - evenPage_section : evenPage section break. section begins on the next even-numbered page, leaving the next odd page blank if necessary. - nextColumn_section : nextColumn section break. section begins on the following column on the page. ONLY HAVE EFFECT IF DOCUMENT HAVE COLUMNS - nextPage_section : nextPage section break. section begins on the following page. - oddPage_section : oddPage section break. section begins on the next odd-numbered page, leaving the next even page blank if necessary. """ #TYPE PARAM CONTROL AND SPLIT valid_separators = {'page_break', 'column_break', 'textWrapping_break', 'continuous_section', 'evenPage_section', 'nextColumn_section', 'nextPage_section', 'oddPage_section'} if not separator in valid_separators: raise ValueError("Invalid separator argument") type, sepClass = separator.split("_") #GET ROOT - WORK WITH DOCUMENT for part in self.parts.values(): root = part.getroot() tag = root.tag if tag == '{%(w)s}ftr' % NAMESPACES or tag == '{%(w)s}hdr' % NAMESPACES: continue if sepClass == 'section': #FINDING FIRST SECTION OF THE DOCUMENT firstSection = root.find("w:body/w:p/w:pPr/w:sectPr", namespaces=NAMESPACES) if firstSection == None: firstSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES) #MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING nextPageSec = deepcopy(firstSection) for child in nextPageSec: #Delete old type if exist if child.tag == '{%(w)s}type' % NAMESPACES: nextPageSec.remove(child) #Create new type (def parameter) newType = etree.SubElement(nextPageSec, '{%(w)s}type' % NAMESPACES) newType.set('{%(w)s}val' % NAMESPACES, type) #REPLACING FIRST SECTION secRoot = firstSection.getparent() secRoot.replace(firstSection, nextPageSec) #FINDING LAST SECTION OF THE DOCUMENT lastSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES) #SAVING LAST SECTION mainSection = deepcopy(lastSection) lsecRoot = lastSection.getparent() lsecRoot.remove(lastSection) #COPY CHILDREN ELEMENTS OF BODY IN A LIST childrenList = root.findall('w:body/*', namespaces=NAMESPACES) #DELETE ALL CHILDREN OF BODY for child in root: if child.tag == '{%(w)s}body' % NAMESPACES: child.clear() #REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT lr = len(replacements) lc = len(childrenList) for i, repl in enumerate(replacements): parts = [] for (j, n) in enumerate(childrenList): element = deepcopy(n) for child in root: if child.tag == '{%(w)s}body' % NAMESPACES: child.append(element) parts.append(element) if (j + 1) == lc: if (i + 1) == lr: child.append(mainSection) parts.append(mainSection) else: if sepClass == 'section': intSection = deepcopy(mainSection) p = etree.SubElement(child, '{%(w)s}p' % NAMESPACES) pPr = etree.SubElement(p, '{%(w)s}pPr' % NAMESPACES) pPr.append(intSection) parts.append(p) elif sepClass == 'break': pb = etree.SubElement(child, '{%(w)s}p' % NAMESPACES) r = etree.SubElement(pb, '{%(w)s}r' % NAMESPACES) nbreak 
= Element('{%(w)s}br' % NAMESPACES) nbreak.attrib['{%(w)s}type' % NAMESPACES] = type r.append(nbreak) self.merge(parts, **repl) def merge_pages(self, replacements): """ Deprecated method. """ warnings.warn("merge_pages has been deprecated in favour of merge_templates", category=DeprecationWarning, stacklevel=2) self.merge_templates(replacements, "page_break") def merge(self, parts=None, **replacements): if not parts: parts = self.parts.values() for field, replacement in replacements.items(): if isinstance(replacement, list): self.merge_rows(field, replacement) else: for part in parts: self.__merge_field(part, field, replacement) def __merge_field(self, part, field, text): for mf in part.findall('.//MergeField[@name="%s"]' % field): children = list(mf) mf.clear() # clear away the attributes mf.tag = '{%(w)s}r' % NAMESPACES mf.extend(children) nodes = [] # preserve new lines in replacement text text = text or '' # text might be None text_parts = str(text).replace('\r', '').split('\n') for i, text_part in enumerate(text_parts): text_node = Element('{%(w)s}t' % NAMESPACES) text_node.text = text_part nodes.append(text_node) # if not last node add new line node if i < (len(text_parts) - 1): nodes.append(Element('{%(w)s}br' % NAMESPACES)) ph = mf.find('MergeText') if ph is not None: # add text nodes at the exact position where # MergeText was found index = mf.index(ph) for node in reversed(nodes): mf.insert(index, node) mf.remove(ph) else: mf.extend(nodes) def merge_rows(self, anchor, rows): table, idx, template = self.__find_row_anchor(anchor) if table is not None: if len(rows) > 0: del table[idx] for i, row_data in enumerate(rows): row = deepcopy(template) self.merge([row], **row_data) table.insert(idx + i, row) else: # if there is no data for a given table # we check whether table needs to be removed if self.remove_empty_tables: parent = table.getparent() parent.remove(table) def __find_row_anchor(self, field, parts=None): if not parts: parts = self.parts.values() for part in parts: for table in part.findall('.//{%(w)s}tbl' % NAMESPACES): for idx, row in enumerate(table): if row.find('.//MergeField[@name="%s"]' % field) is not None: return table, idx, row return None, None, None def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def close(self): if self.zip is not None: try: self.zip.close() finally: self.zip = None
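# Usage sketch for the MailMerge class above, based only on the methods it
# defines (get_merge_fields, merge, write, and the context-manager protocol).
# The template path and field names are hypothetical.
with MailMerge('letter_template.docx') as document:
    print(document.get_merge_fields())
    document.merge(first_name='Ada', last_name='Lovelace')
    document.write('letter_filled.docx')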
def __init__(self, fn, only_load_visible_shapes=True,
             visible_if_ViewObject_missing=True, printLevel=0):
    z = ZipFile(fn)
    if printLevel > 0:
        print(z.namelist())
        print(xml_prettify(z.open('Document.xml').read().decode('utf-8')))
        if 'GuiDocument.xml' in z.namelist():
            print(xml_prettify(z.open('GuiDocument.xml').read().decode('utf-8')))
    tree_doc = XML_Tree.fromstring(z.open('Document.xml').read().decode('utf-8'))
    if 'GuiDocument.xml' in z.namelist():
        tree_gui = XML_Tree.fromstring(z.open('GuiDocument.xml').read().decode('utf-8'))
    else:
        tree_gui = None
    #tree_shapes = ElementTree.fromstring( z.open('PartShape.brp').read() )
    doc = Fcstd_Property_List(tree_doc.find('Properties'))
    self.__dict__.update(doc.__dict__)
    self.Name = os.path.split(fn)[1][:-6]
    self.Objects = []
    self.Objects_dict = {}
    # objectData
    for o in tree_doc.find('ObjectData').findall('Object'):
        k = o.attrib['name']
        # object names must be unique; check the dict of names, not the object list
        assert k not in self.Objects_dict
        obj = Fcstd_Property_List(o.find('Properties'))
        obj.Name = k
        obj.Content = str(XML_Tree.tostring(o))
        self.Objects_dict[k] = obj
        self.Objects.append(self.Objects_dict[k])
    # viewObjects
    if tree_gui is not None:
        for o in tree_gui.find('ViewProviderData').findall('ViewProvider'):
            k = o.attrib['name']
            if k in self.Objects_dict:
                ViewObject = Fcstd_Property_List(o.find('Properties'))
                ViewObject.isVisible = isVisible_Bound_Method(ViewObject)
                self.Objects_dict[k].ViewObject = ViewObject
    else:
        for obj in self.Objects:
            xml = '<Properties> <Property name="Visibility" type="App::PropertyBool"> <Bool value="%s"/> </Property> </Properties>' % (
                'true' if visible_if_ViewObject_missing else 'false')
            obj.ViewObject = Fcstd_Property_List(XML_Tree.fromstring(xml))
            obj.ViewObject.isVisible = isVisible_Bound_Method(obj.ViewObject)
    # shapes
    for obj in self.Objects:
        if hasattr(obj, 'Shape'):
            shape_zip_name = obj.Shape
            delattr(obj, 'Shape')
            if not only_load_visible_shapes or obj.ViewObject.Visibility:
                obj.Shape = Part.Shape()
                obj.Shape.importBrepFromString(z.open(shape_zip_name).read().decode('utf-8'))
    # colour lists
    for obj in self.Objects:
        if hasattr(obj, 'ViewObject'):
            v = obj.ViewObject
            if not only_load_visible_shapes or obj.ViewObject.Visibility:
                for p_name, p_type in zip(v.PropertiesList, v.PropertiesTypes):
                    if p_type == 'App::PropertyColorList':
                        # print(p_name, getattr(v, p_name))
                        fn = getattr(v, p_name)
                        C = parse_Clr_Array(z.open(fn).read())
                        setattr(v, p_name, C)
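# Minimal sketch of the container layout assumed above: a FreeCAD .FCStd file
# is a plain zip whose Document.xml (and optional GuiDocument.xml) carry the
# property trees. The file name is hypothetical.
from xml.etree import ElementTree
from zipfile import ZipFile

with ZipFile('part.FCStd') as z:
    doc_root = ElementTree.fromstring(z.open('Document.xml').read().decode('utf-8'))
    print(doc_root.tag, 'GuiDocument.xml' in z.namelist())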
def getTranslations(localeConfig, projectName, key): """Download all available translations from crowdin. Trigger crowdin to build the available export, wait for crowdin to finish the job and download the generated zip afterwards. """ crowdin_request(projectName, 'export', key) result = crowdin_request(projectName, 'download/all.zip', key, raw=True) zip = ZipFile(StringIO(result)) dirs = {} normalizedDefaultLocale = localeConfig['default_locale'].replace('_', '-') normalizedDefaultLocale = CROWDIN_LANG_MAPPING.get(normalizedDefaultLocale, normalizedDefaultLocale) for info in zip.infolist(): if not info.filename.endswith('.json'): continue dir, file = os.path.split(info.filename) if not re.match(r'^[\w\-]+$', dir) or dir == normalizedDefaultLocale: continue if file.count('.') == 1: origFile = file else: origFile = os.path.splitext(file)[0] for key, value in CROWDIN_LANG_MAPPING.iteritems(): if value == dir: dir = key dir = dir.replace('-', '_') data = zip.open(info.filename).read() if data == '[]': continue if not dir in dirs: dirs[dir] = set() dirs[dir].add(origFile) path = os.path.join(localeConfig['base_path'], dir, origFile) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) if file.endswith('.json'): postprocessChromeLocale(path, data) else: data = json.loads(data) if origFile in data: fileHandle = codecs.open(path, 'wb', encoding='utf-8') fileHandle.write(data[origFile]['message']) fileHandle.close() # Remove any extra files for dir, files in dirs.iteritems(): baseDir = os.path.join(localeConfig['base_path'], dir) if not os.path.exists(baseDir): continue for file in os.listdir(baseDir): path = os.path.join(baseDir, file) valid_extension = file.endswith('.json') if os.path.isfile(path) and valid_extension and not file in files: os.remove(path)
"""
This script is responsible for updating the source data of PyCollatinus.

.. author:: Thibault Clérice (@ponteineptique)
"""
from io import BytesIO
from zipfile import ZipFile
from urllib3 import PoolManager
import glob

# Setting up the list of files to update
files = [(file, file.replace("pycollatinus/data", "collatinus-master/bin/data"))
         for file in glob.glob("pycollatinus/data/*.*")
         if not file.endswith(".pickle")]

print("Contacting Github")
http = PoolManager()
response = http.request(
    "GET",
    "https://github.com/biblissima/collatinus/archive/master.zip")
print("Reading zip")
zipfile = ZipFile(BytesIO(response.data))
for target, source in files:
    print("\tUpdating {}".format(target))
    with zipfile.open(source) as source_io:
        with open(target, "w") as target_io:
            target_io.write(
                source_io.read().decode().replace(
                    "ho!|inv|||interj.|1",
                    "ho|inv|||interj.|1"  # Known line that creates a bug in PyCollatinus
                )
            )
print("Done")
def _get_identifiers(self, limit):
    """
    This will process the id mapping file provided by Biogrid.
    The file has a very large header, which we scan past,
    then pull the identifiers, and make equivalence axioms

    :param limit:
    :return:

    """
    LOG.info("getting identifier mapping")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['identifiers']['file']))
    myzip = ZipFile(f, 'r')
    # assume that the first entry is the item
    fname = myzip.namelist()[0]
    foundheader = False

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
    with myzip.open(fname, 'r') as csvfile:
        for line in csvfile:
            # skip header lines
            if not foundheader:
                if re.match(r'BIOGRID_ID', line.decode()):
                    foundheader = True
                continue

            line = line.decode().strip()
            # BIOGRID_ID
            # IDENTIFIER_VALUE
            # IDENTIFIER_TYPE
            # ORGANISM_OFFICIAL_NAME
            # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
            (biogrid_num, id_num, id_type, organism_label) = line.split('\t')

            if self.test_mode:
                graph = self.testgraph
                # skip any genes that don't match our test set
                if int(biogrid_num) not in self.biogrid_ids:
                    continue
            else:
                graph = self.graph
            model = Model(graph)

            # for each one of these,
            # create the node and add equivalent classes
            biogrid_id = 'BIOGRID:' + biogrid_num
            prefix = self.localtt[id_type]

            # TODO make these filters available as commandline options
            # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
            # WormBase,XenBase,ENSEMBL,miRBase'.split(',')
            geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
            # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
            if (speciesfilters is not None) \
                    and (organism_label.strip() in speciesfilters):
                line_counter += 1
                if (geneidtypefilters is not None) \
                        and (prefix in geneidtypefilters):
                    mapped_id = ':'.join((prefix, id_num))
                    model.addEquivalentClass(biogrid_id, mapped_id)
                # this symbol will only get attached to the biogrid class
                elif id_type == 'OFFICIAL_SYMBOL':
                    model.addClassToGraph(biogrid_id, id_num)
                # elif (id_type == 'SYNONYM'):
                #     FIXME - i am not sure these are synonyms, altids?
                #     gu.addSynonym(g,biogrid_id,id_num)

            if not self.test_mode and limit is not None and line_counter > limit:
                break

    myzip.close()
    return
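# Standalone sketch of the header-scan pattern above: stream a zipped text
# member line by line, discard everything before the header row, then split
# tab-separated records. Archive and member names are hypothetical.
import re
from zipfile import ZipFile

with ZipFile('identifiers.zip') as z:
    with z.open(z.namelist()[0]) as fh:
        in_body = False
        for raw in fh:
            line = raw.decode().strip()
            if not in_body:
                in_body = bool(re.match(r'BIOGRID_ID', line))
                continue
            fields = line.split('\t')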
def __init__(self, path="."): if os.path.isfile(path): zip_file = ZipFile(path) open_file = lambda filename: io.TextIOWrapper( zip_file.open(filename), encoding="utf-8") else: assert os.path.isdir(path) open_file = lambda filename: open(os.path.join(path, filename)) self.stops = { stop.stop_id: stop for stop in parse_csv(open_file("stops.txt"), "Stop") } self.routes = { route.route_id: route for route in parse_csv(open_file("routes.txt"), "Route") } self.trips = { trip.trip_id: trip for trip in parse_csv(open_file("trips.txt"), "Trip") } self.stop_times = parse_csv(open_file("stop_times.txt"), "StopTime") try: self.services = { service.service_id: service for service in parse_csv(open_file("calendar.txt"), "Calendar") } except Exception as e: print(e) self.services = {} self.agency = { agency.agency_id: agency for agency in parse_csv(open_file("agency.txt"), "Agency") } self.shapes = {} try: for point in parse_csv(open_file("shapes.txt"), "Shape"): if point.shape_id not in self.shapes: self.shapes[point.shape_id] = [] self.shapes[point.shape_id].append(point) except: pass for shape in self.shapes.values(): shape.sort(key=lambda point: int(point.shape_pt_sequence)) self.stop_times_by_trip_id = {} for stop_time in self.stop_times: if stop_time.trip_id not in self.stop_times_by_trip_id: self.stop_times_by_trip_id[stop_time.trip_id] = [] self.stop_times_by_trip_id[stop_time.trip_id].append(stop_time) for stop_time_list in self.stop_times_by_trip_id.values(): stop_time_list.sort( key=lambda stop_time: int(stop_time.stop_sequence)) self.trips_by_list_of_stops = {} for trip in self.trips.values(): trip_stops = self.trip_stops_ids(trip.trip_id) if trip_stops not in self.trips_by_list_of_stops: self.trips_by_list_of_stops[trip_stops] = set() self.trips_by_list_of_stops[trip_stops].add(trip.trip_id) self.all_lists_of_stops = sorted(self.trips_by_list_of_stops.keys())
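# Usage sketch for the constructor above, assuming the enclosing class is
# named Feed (the class name is not shown in the snippet); a zipped GTFS feed
# and an extracted directory are both accepted. The path is hypothetical.
feed = Feed('gtfs.zip')
print(len(feed.stops), 'stops;', len(feed.routes), 'routes')
for trip_stops in feed.all_lists_of_stops[:3]:
    print(feed.trips_by_list_of_stops[trip_stops])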
from zipfile import ZipFile

try:  # Python 2
    from StringIO import StringIO as ReaderIO
    from urllib import urlopen
except ImportError:  # Python 3
    from io import BytesIO as ReaderIO
    from urllib.request import urlopen

print('Downloading large collection of URDF from Drake project...')
print('This might take a few minutes...')
resp = urlopen(
    'https://github.com/RobotLocomotion/drake/archive/master.zip')
zipfile = ZipFile(ReaderIO(resp.read()))
errors = []
all_files = []
# `Robot` (with its `from_urdf_file` constructor) is assumed to be imported
# from the URDF-parsing library this script targets.
for f in zipfile.namelist():
    if f.endswith('.urdf') or f.endswith('.xacro'):
        with zipfile.open(f) as urdf_file:
            try:
                all_files.append(f)
                r = Robot.from_urdf_file(urdf_file)
            except Exception as e:
                errors.append((f, e))

print('Found %d files and parsed successfully %d of them' %
      (len(all_files), len(all_files) - len(errors)))

if len(errors):
    print('\nErrors found during parsing:')
    for error in errors:
        print(' * File=%s, Error=%s' % error)