def __init__(cls, **retrieval_kwargs):
    """Retrieve sensor information from the InfluencAir project.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        KeyError if sheet structure does not match listed columns
    """
    sensor_info = retrieve(SENSOR_INFO_CACHE_FILE, SENSOR_SHEET_DOWNLOAD_URL,
                           "InfluencAir sensor information",
                           read_func=pd.read_csv,
                           read_func_kwargs={"header": 1, "dtype": "object"},
                           **retrieval_kwargs)
    try:
        sensor_info = (sensor_info[["Chip ID", "PM Sensor ID",
                                    "Hum/Temp Sensor ID", "Label", "Address",
                                    "Floor", "Side (Street/Garden)"]]
                       .rename(columns={"Side (Street/Garden)": "Side"}))
    except KeyError:
        raise KeyError("Could not get columns. Check if the structure or "
                       "labels of the InfluencAir sensor Google Sheet "
                       "have changed.")
    cls.sensors = sensor_info
    cls.initialized = True
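# The retrieve() function called throughout these snippets is not shown in
# this section. The following is only a minimal sketch of a cache-or-download
# helper with a compatible call signature; the pickle cache, the
# refresh_cache keyword, and the default JSON flattening are assumptions, not
# the original implementation.
import io
import os

import pandas as pd
import requests


def retrieve_sketch(cache_file, url, label, read_func=None,
                    read_func_kwargs=None, refresh_cache=False):
    """Return cached data if available, otherwise download and cache it."""
    if not refresh_cache and os.path.isfile(cache_file):
        return pd.read_pickle(cache_file)
    print("Downloading", label)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    if read_func is None:
        # Flatten nested JSON into dotted column names like "properties.id"
        data = pd.json_normalize(response.json())
    else:
        data = read_func(io.BytesIO(response.content),
                         **(read_func_kwargs or {}))
    data.to_pickle(cache_file)
    return data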
class PricerServer(object):
    _reg_progid_ = "ppf.pricer"
    _reg_clsid_ = "{08632905-0B63-45B5-B388-30C73CAE611C}"
    _public_methods_ = \
        [
            "CreateHullWhiteLatticePricer"
          , "InvokePricer"
        ]
    _pricers = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'pricer_server', 'PricerServer', tag, which))

    def CreateHullWhiteLatticePricer(self, tag, trade_id, env_id,
                                     num_states, num_std_dev):
        try:
            from trade_server import TradeServer
            from market_server import MarketServer
            trade = TradeServer.retrieve(trade_id, 'trades')
            env = MarketServer.retrieve(env_id, 'environments')
            model_args = {"num states": num_states,
                          "num std dev": num_std_dev}
            factory = ppf.model.hull_white_lattice_model_factory()
            model = factory(trade, env, model_args)
            pricer = ppf.pricer.lattice_pricer(trade, model, env, None)
            PricerServer._pricers[tag] = pricer
            return tag
        except RuntimeError, e:
            ppf.com.utils.raise_com_exception(e)
def get_stations(cls, **retrieval_kwargs):
    """Retrieve a list of measuring stations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    # Retrieve and reshape data
    stations = retrieve(STATIONS_CACHE_FILE, STATIONS_URL,
                        "station metadata", **retrieval_kwargs)
    stations = (stations
                .drop(columns=["geometry.type", "type"])
                .rename(columns={"properties.id": "id",
                                 "properties.label": "label"})
                .set_index("id"))

    # Split coordinates into columns
    coords = pd.DataFrame([row for row in stations["geometry.coordinates"]],
                          index=stations.index)
    stations[["lon", "lat", "alt"]] = coords
    stations.drop(columns=["geometry.coordinates", "alt"], inplace=True)

    cls.stations = stations
def _add_arch_linux_libcs():
    def _find_packages_urls(architecture):
        url = "https://archive.archlinux.org/packages/g/glibc/"
        try:
            packages_filenames = utils.findall(
                fr"['\"](?P<filename>glibc-(?:.*?)-{architecture}\.pkg\.tar\.[gx]z)['\"]",
                url,
            )
        except AttributeError:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        else:
            packages_urls = [
                os.path.join(url, package_filename)
                for package_filename in packages_filenames
            ]
            return packages_urls

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "arch")
    os.makedirs(distro_dirpath, exist_ok=True)
    for architecture in ("i686", "x86_64"):
        for package_url in _find_packages_urls(architecture):
            if _already_in_db(package_url):
                print(f"Skipping: {utils.make_bright(package_url)}")
                continue
            with tempfile.TemporaryDirectory() as tmp_dirpath:
                print(f"Downloading: {utils.make_bright(package_url)}")
                package_filepath = utils.retrieve(package_url, tmp_dirpath)
                add(package_filepath, dest_dirpath=distro_dirpath)
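# _already_in_db() is called above (and in the Debian/Ubuntu variants below)
# but is not defined in this section. A purely illustrative sketch, not the
# project's actual implementation: treat a package as already present if a
# file with the same basename exists anywhere under the libcs directory.
import os


def _already_in_db_sketch(package_url, libcs_dirpath):
    package_filename = os.path.basename(package_url)
    for _dirpath, _dirnames, filenames in os.walk(libcs_dirpath):
        if package_filename in filenames:
            return True
    return False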
def _add_debian_libcs():
    def _find_packages_urls(release, architecture, package):
        url = f"https://packages.debian.org/{release}/{architecture}/{package}/download"
        try:
            package_url = utils.search(
                r"['\"](?P<url>https?.*?libc6.*?.deb)['\"]",
                url).group("url")
        except AttributeError:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        else:
            return [package_url]

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "debian")
    os.makedirs(distro_dirpath, exist_ok=True)
    for release in ("squeeze", "wheezy", "jessie", "stretch", "buster"):
        release_dirpath = os.path.join(distro_dirpath, release)
        os.makedirs(release_dirpath, exist_ok=True)
        for architecture in ("i386", "amd64"):
            for package in ("libc6", "libc6-dbg"):
                for package_url in _find_packages_urls(release, architecture,
                                                       package):
                    if _already_in_db(package_url):
                        print(f"Skipping: {utils.make_bright(package_url)}")
                        continue
                    with tempfile.TemporaryDirectory() as tmp_dirpath:
                        print(f"Downloading: "
                              f"{utils.make_bright(package_url)}")
                        package_filepath = utils.retrieve(package_url,
                                                          tmp_dirpath)
                        add(package_filepath, dest_dirpath=release_dirpath)
def text(i, url, params):
    """Scrape and save the article body."""
    content = retrieve(url, params)
    tree = etree.HTML(content)
    t = tree.xpath("//div[@id='article_content']/*")
    _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
    data[i].append(_content)
def text(i, url, params):
    """Scrape and save the article body."""
    content = retrieve(url, params)
    tree = etree.HTML(content)
    t = tree.xpath("//section[@class='article']/*")
    _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
    print len(_content)
    data[i].append(_content)
def all_words():
    data = retrieve('ordmyndalisti')
    if data is not None:
        return data
    else:
        data1 = to_unicode_or_bust(
            open(os.path.join(os.path.dirname(__file__),
                              'ordmyndalisti.txt'), 'r').read())
        data2 = to_unicode_or_bust(
            open(os.path.join(os.path.dirname(__file__),
                              'ordmyndalisti2.txt'), 'r').read())
        data = data1 + data2
        store('ordmyndalisti', data)
        return data
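# retrieve() and store() above are not defined in this snippet. A minimal
# sketch of a compatible pair, assuming a simple pickle-file cache keyed by
# name; the cache directory, the file format, and the function names below
# are assumptions, not taken from the original project.
import os
import pickle

CACHE_DIR_SKETCH = os.path.join(os.path.dirname(__file__), "cache")


def cache_store(key, value):
    os.makedirs(CACHE_DIR_SKETCH, exist_ok=True)
    with open(os.path.join(CACHE_DIR_SKETCH, key + ".pkl"), "wb") as f:
        pickle.dump(value, f)


def cache_retrieve(key):
    try:
        with open(os.path.join(CACHE_DIR_SKETCH, key + ".pkl"), "rb") as f:
            return pickle.load(f)
    except OSError:
        return None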
def get_time_series(cls, **retrieval_kwargs):
    """Retrieve information on available time series: a collection of
    station & phenomenon combinations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    def get_phenomenon_name(label):
        """Extract phenomenon name from time series label."""
        phenomenon_name_series_id = (label
                                     .split(sep=" - ", maxsplit=1)[0])
        phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
        return phenomenon_name

    # Retrieve and reshape data
    time_series = retrieve(TIME_SERIES_CACHE_FILE,
                           API_ENDPOINTS["timeseries"],
                           "time series metadata", **retrieval_kwargs)
    time_series["id"] = time_series["id"].astype("int")
    time_series = (time_series
                   .set_index("id")
                   .drop(columns=["station.geometry.type", "station.type"])
                   .rename(columns={"station.properties.id": "station_id",
                                    "station.properties.label":
                                        "station_label",
                                    "uom": "unit"}))

    # Extract phenomenon names from labels
    labels = time_series["label"]
    time_series["phenomenon"] = labels.apply(get_phenomenon_name)

    # Split coordinates into columns
    coords = pd.DataFrame([row for row
                           in time_series["station.geometry.coordinates"]],
                          index=time_series.index)
    time_series[["station_lat", "station_lon"]] = coords[[1, 0]]

    # Sort and drop columns
    time_series = time_series[["label", "phenomenon", "unit", "station_id",
                               "station_label", "station_lat",
                               "station_lon"]]

    # Clean unit descriptors
    time_series["unit"] = (time_series["unit"]
                           .str.replace("m3", "m³")
                           .str.replace("ug", "µg"))
    time_series.loc[time_series["phenomenon"] == "temperature",
                    "unit"] = "°C"

    cls.time_series = time_series
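# A self-contained check of the label parsing in get_phenomenon_name() above.
# The example label is made up but follows the shape the parser assumes:
# "<phenomenon name> <series id> - <station description>".
label = "Particulate Matter < 10 µm 6522 - 41R012 - Uccle"
phenomenon_name_series_id = label.split(sep=" - ", maxsplit=1)[0]
phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
print(phenomenon_name)  # Particulate Matter < 10 µm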
def get_phenomena(cls, **retrieval_kwargs):
    """Retrieve a list of measured phenomena.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    phenomena = retrieve(PHENOMENA_CACHE_FILE, API_ENDPOINTS["phenomena"],
                         "phenomenon metadata", **retrieval_kwargs)
    phenomena["id"] = phenomena["id"].astype("int")
    phenomena = phenomena.set_index("id").sort_index()
    cls.phenomena = phenomena
def profile(self, url, params):
    """Scrape and save cover, title, and summary."""
    tree = etree.HTML(retrieve(url, params))
    for x in tree.xpath("//ul[@class='daily-list']/li"):
        cover = x.find('a/img').attrib['src']
        title = x.find("div[@class='daily-cont']/h2/a").text
        url = ('http://zhidao.baidu.com'
               + x.find("div[@class='daily-cont']/h2/a").attrib['href'])
        brief = x.find("div[@class='daily-cont']/div[@class='summer']/a").text
        author = '知道日报'
        code = 'zdrb'
        self.data.append([url, cover, title, brief, author, code])
def get_phenomena(cls, **retrieval_kwargs):
    """Retrieve a list of measured phenomena.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    phenomena = retrieve(PHENOMENA_CACHE_FILE, PHENOMENA_URL,
                         "phenomenon metadata", **retrieval_kwargs)
    # FIXME: id not converted to int
    phenomena.set_index("id", inplace=True)
    phenomena.sort_index(inplace=True)
    cls.phenomena = phenomena
def post():
    title = 'Artist Search Result'
    if request.method == 'POST':
        kwds = request.form['keyword']
        try:
            search_type = request.form['search_type']
        except KeyError:
            message = 'Please select a search method'
            return render_template('index.html', title=title,
                                   message=message)
        result = retrieve(kwds, search_type)
        return render_template('index.html', title=title, keyword=kwds,
                               result=result)
    else:
        return redirect(url_for('index'))
def get_metadata(self, **retrieval_kwargs):
    """Get sensor metadata and current measurements from cache or
    luftdaten.info API.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Warns:
        UserWarning if sensor does not appear to be online
    """

    # Get and cache metadata and measurements of past five minutes
    filename = os.path.basename(self.metadata_url.rstrip("/")) + ".json"
    filepath = os.path.join(CACHE_DIR, filename)
    parsed = retrieve(filepath, self.metadata_url,
                      "sensor {} metadata from luftdaten.info"
                      .format(self.sensor_id),
                      **retrieval_kwargs)
    try:
        metadata = (parsed
                    .drop(columns=["sensordatavalues", "timestamp"])
                    .iloc[0])
    except ValueError:
        warnings.warn("Sensor metadata could not be retrieved")
    else:
        metadata.name = "metadata"
        self.metadata = metadata

        # Extract metadata into corresponding properties
        self.sensor_type = metadata["sensor.sensor_type.name"]
        self.lat = float(metadata["location.latitude"])
        self.lon = float(metadata["location.longitude"])
        self.label = "at " + label_coordinates(self.lat, self.lon)

        # Extract most current measurements
        current = parsed["sensordatavalues"].iloc[-1]
        current = (json_normalize(current)
                   .replace({"P1": "pm10", "P2": "pm2.5"})
                   .set_index("value_type")["value"])
        current = (pd.to_numeric(current)
                   .replace([999.9, 1999.9], pd.np.nan))
        self.current_measurements = dict(current)
        self.phenomena = list(current.index)
        self.units = {phenomenon: UNITS[phenomenon]
                      for phenomenon in UNITS
                      if phenomenon in self.phenomena}
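# A self-contained illustration of the "current measurements" extraction
# above, using a made-up sensordatavalues record shaped like the
# luftdaten.info API response (the IDs and values are invented).
import pandas as pd

sensordatavalues = [{"id": 1, "value_type": "P1", "value": "24.30"},
                    {"id": 2, "value_type": "P2", "value": "12.10"}]
current = (pd.json_normalize(sensordatavalues)
           .replace({"P1": "pm10", "P2": "pm2.5"})
           .set_index("value_type")["value"])
current = pd.to_numeric(current).replace([999.9, 1999.9], float("nan"))
print(dict(current))  # {'pm10': 24.3, 'pm2.5': 12.1}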
def profile(url, params):
    """Scrape and save cover, title, and summary."""
    content = retrieve(url, params)
    tree = etree.HTML(content)
    for x in tree.xpath("//article[@class='posts post-1 cf']"):
        cover = x.find("div[@class='left left-col']/div[2]/a/img").attrib['data-src']
        title = x.find("div[@class='right-col']/h1/a").text
        url = ('http://www.36kr.com'
               + x.find("div[@class='right-col']/h1/a").attrib['href'])
        brief = x.find("div[@class='right-col']/p").text
        author = '36氪'
        code = '36kr'
        data.append([url, cover, title, brief, author, code])
def profile(url, params):
    """Scrape and save cover, title, and summary."""
    content = retrieve(url, params)
    tree = etree.HTML(content)
    for x in tree.xpath("//div[@class='clearfix mod-b mod-art']"):
        cover = x.find('a/img').attrib['src']
        title = x.find("div/h3/a").text
        url = 'http://www.huxiu.com' + x.find("div/h3/a").attrib['href']
        brief = x.find("div/div[2]").text
        author = '虎嗅网'
        code = 'hx'
        data.append([url, cover, title, brief, author, code])
def process_source(source):
    fname = None
    _buffer = utils.retrieve(source)
    (_, tmpfname) = tempfile.mkstemp()
    tmpf = open(tmpfname, "w")
    tmpf.write(_buffer.getvalue())
    tmpf.close()
    archive = utils.get_archive_type(tmpfname)
    if archive == "gzip":
        fname = utils.ungzip(tmpfname)
        os.remove(tmpfname)
    elif archive == "bzip":
        fname = utils.unbzip(tmpfname)
        os.remove(tmpfname)
    elif archive is False:
        fname = tmpfname
    return (fname, True)
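# utils.retrieve(), utils.get_archive_type(), utils.ungzip() and
# utils.unbzip() are not shown in this section. A stdlib-only sketch of the
# same flow, with magic-number sniffing and decompression standing in for
# those helpers; this is an assumption about their behavior, not the
# project's implementation.
import bz2
import gzip
import os
import tempfile


def process_source_sketch(buffer_bytes):
    fd, tmpfname = tempfile.mkstemp()
    with os.fdopen(fd, "wb") as tmpf:
        tmpf.write(buffer_bytes)
    with open(tmpfname, "rb") as f:
        magic = f.read(3)
    if magic[:2] == b"\x1f\x8b":  # gzip magic number
        fname = tmpfname + ".out"
        with gzip.open(tmpfname, "rb") as src, open(fname, "wb") as dst:
            dst.write(src.read())
        os.remove(tmpfname)
    elif magic == b"BZh":  # bzip2 magic number
        fname = tmpfname + ".out"
        with bz2.open(tmpfname, "rb") as src, open(fname, "wb") as dst:
            dst.write(src.read())
        os.remove(tmpfname)
    else:
        fname = tmpfname
    return (fname, True)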
def get_time_series(cls, **retrieval_kwargs):
    """Retrieve information on available time series: a collection of
    station & phenomenon combinations.

    Args:
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """

    def get_phenomenon_name(label):
        """Extract phenomenon name from time series label."""
        phenomenon_name_series_id = label.split(sep=" - ", maxsplit=1)[0]
        phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
        return phenomenon_name

    # Retrieve and reshape data
    time_series = retrieve(TIME_SERIES_CACHE_FILE, TIME_SERIES_URL,
                           "time series metadata", **retrieval_kwargs)
    time_series.set_index("id", inplace=True)
    time_series.drop(columns=["station.geometry.type", "station.type"],
                     inplace=True)
    time_series.rename(columns={"station.properties.id": "station_id",
                                "station.properties.label": "station_label",
                                "uom": "unit"},
                       inplace=True)

    # Extract phenomenon names from labels
    labels = time_series["label"]
    time_series["phenomenon"] = labels.apply(get_phenomenon_name)

    # Split coordinates into columns
    coords = pd.DataFrame([row for row
                           in time_series["station.geometry.coordinates"]],
                          index=time_series.index)
    time_series[["station_lon", "station_lat", "station_alt"]] = coords

    # Sort and drop columns
    time_series = time_series[["label", "phenomenon", "unit", "station_id",
                               "station_label", "station_lon",
                               "station_lat"]]

    cls.time_series = time_series
def _add_ubuntu_libcs():
    def _find_packages_urls(release, architecture, package):
        url = f"https://launchpad.net/ubuntu/{release}/{architecture}/{package}"
        packages_versions = set(
            utils.findall(
                fr'"/ubuntu/.+?/{package}/(?P<version>.+?)(?:\.\d+)?"', url))
        if not packages_versions:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []

        n = 3
        most_recent_packages_versions = sorted(packages_versions,
                                               reverse=True)[:n]
        packages_urls = [
            utils.search(
                r"['\"](?P<url>https?.*?libc6.*?.deb)['\"]",
                f"https://launchpad.net/ubuntu/{release}/{architecture}/{package}/{package_filename}",
            ).group("url")
            for package_filename in most_recent_packages_versions
        ]
        if not packages_urls:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        return packages_urls

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "ubuntu")
    os.makedirs(distro_dirpath, exist_ok=True)
    for release in ("trusty", "xenial", "artful", "bionic"):
        release_dirpath = os.path.join(distro_dirpath, release)
        os.makedirs(release_dirpath, exist_ok=True)
        for architecture in ("i386", "amd64"):
            for package in ("libc6", "libc6-dbg"):
                for package_url in _find_packages_urls(release, architecture,
                                                       package):
                    if _already_in_db(package_url):
                        print(f"Skipping: {utils.make_bright(package_url)}")
                        continue
                    with tempfile.TemporaryDirectory() as tmp_dirpath:
                        print(f"Downloading: "
                              f"{utils.make_bright(package_url)}")
                        package_filepath = utils.retrieve(package_url,
                                                          tmp_dirpath)
                        add(package_filepath, dest_dirpath=release_dirpath)
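# utils.findall() and utils.search() above take a regular expression and a
# URL; their definitions are not part of this section. A plausible minimal
# sketch, assuming they fetch the page and run re.findall / re.search over
# its text; the names and timeout below are illustrative only.
import re

import requests


def findall_sketch(pattern, url):
    page = requests.get(url, timeout=30).text
    return re.findall(pattern, page)


def search_sketch(pattern, url):
    page = requests.get(url, timeout=30).text
    return re.search(pattern, page)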
class TradeServer(object):
    _reg_progid_ = "ppf.trade"
    _reg_clsid_ = "{E33DA322-B011-4FE9-8AB9-87A964EDD046}"
    _public_methods_ = \
        [
            "GenerateFixedCouponObservables"
          , "GenerateLiborObservables"
          , "GenerateFlows"
          , "GenerateAdjuvantTable"
          , "GenerateExerciseSchedule"
          , "CreateLeg"
          , "CreateTrade"
        ]
    _observables = {}
    _flows = {}
    _adjuvants = {}
    _legs = {}
    _exercises = {}
    _trades = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'trade_server', 'TradeServer', tag, which))

    def GenerateFixedCouponObservables(self, tag, start, end, roll_period,
                                       roll_duration, reset_currency,
                                       coupon_shift_method, coupon_rate):
        try:
            observables = \
                ppf.core.generate_fixed_coupon_observables(
                    start=utils.to_ppf_date(start)
                  , end=utils.to_ppf_date(end)
                  , roll_period=roll_period
                  , roll_duration=eval("ppf.date_time." + roll_duration)
                  , reset_currency=reset_currency
                  , coupon_shift_method=eval(
                        "ppf.date_time.shift_convention."
                        + coupon_shift_method)
                  , coupon_rate=coupon_rate)
            TradeServer._observables[tag] = observables
            return tag
        except RuntimeError, e:
            utils.raise_com_exception(e)
class MarketServer(object):
    _reg_progid_ = "ppf.market"
    _reg_clsid_ = "{CAFAEEDF-E876-4DD6-9B6F-7038EDA25BCD}"
    _public_methods_ = \
        [
            "CreateEnvironment"
          , "EraseEnvironment"
          , "AddCurve"
          , "AddSurface"
          , "AddConstant"
          , "ListKeys"
        ]
    _environments = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'market_server', 'MarketServer', tag, which))

    def CreateEnvironment(self, tag, t):
        try:
            MarketServer._environments[tag] = \
                ppf.market.environment(utils.to_ppf_date(t))
            return tag
        except RuntimeError, e:
            utils.raise_com_exception(e)
ret.add_argument("repo_dir2", type=str, help="source filename 2") ret.add_argument("--destdir", type=str, help="result filename", required=True) return ret if __name__ == "__main__": ap = make_arg_parser() args = ap.parse_args() fst_repo_files = {} snd_repo_files = {} for target, source in zip((fst_repo_files, snd_repo_files), (args.repo_dir1, args.repo_dir2)): p = parser.Parser() strbuffer = utils.retrieve(os.path.join(source, "repodata", "repomd.xml")) parsed = p.parse_str(strbuffer.getvalue()) for data in parsed.get("repomd.data").objects: url = data.get("location.href") _type = data.get("type") target[_type] = os.path.join(source, url) missing_in_1 = set(fst_repo_files) - set(snd_repo_files) missing_in_2 = set(snd_repo_files) - set(fst_repo_files) print "missing types in 1 repo %s" % ",".join(missing_in_1) print "missing types in 2 repo %s" % ",".join(missing_in_2) common = set(fst_repo_files) & set(snd_repo_files) if not os.path.exists(args.destdir): os.mkdir(args.destdir) for _type in common:
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Get measurement data of the sensor in a given period.

    Data are read from cache if available, or downloaded from
    luftdaten.info and saved to cache as retrieved, and then cleaned for
    self.measurements. If the instance already has data associated with
    it, calling this method replaces them.

    Args:
        start_date: first date of data to retrieve, in ISO 8601
            (YYYY-MM-DD) format
        end_date: last date of data to retrieve, in ISO 8601 (YYYY-MM-DD)
            format
        retrieval_kwargs: keyword arguments to pass to retrieve function
    """
    sid = self.sensor_id
    if self.sensor_type is None:
        self.sensor_type = input("Type of sensor {} has not been set yet. "
                                 "Enter sensor type: ".format(sid))
    stype = self.sensor_type.lower()

    # Get and process the data file for each date in the requested range
    daily_data = []
    for date in pd.date_range(start_date, end_date):
        date_iso = date.strftime("%Y-%m-%d")
        filename = ARCHIVE_FILENAME_PATTERN.format(date=date_iso,
                                                   sensor_type=stype,
                                                   sensor_id=sid)
        filepath = os.path.join(CACHE_DIR, filename)
        url = ARCHIVE_URL_PATTERN.format(date=date_iso, filename=filename)
        data = retrieve(filepath, url,
                        "luftdaten.info data for sensor {} on {}"
                        .format(sid, date_iso),
                        read_func=pd.read_csv,
                        read_func_kwargs={"sep": ";"},
                        **retrieval_kwargs)
        if data is None:
            continue

        # Parse timestamps and make them timezone aware
        timestamps = pd.to_datetime(data["timestamp"], utc=True)

        # Reformat data according to sensor type
        data.set_index(timestamps, inplace=True)
        if self.sensor_type in ("SDS011", "HPM"):
            data = (data[["P1", "P2"]]
                    .rename(columns={"P1": "pm10", "P2": "pm2.5"}))
        elif self.sensor_type == "DHT22":
            data = data[["temperature", "humidity"]]
        else:
            raise NotImplementedError("No data parsing method implemented "
                                      "for sensor type {}"
                                      .format(self.sensor_type))
        daily_data.append(data)

    # If daily data were retrieved, concatenate them to a single dataframe
    if daily_data:
        self.measurements = pd.concat(daily_data)
    else:
        self.measurements = None
        print("No data for sensor", sid)
        return

    # Remove duplicates
    duplicates = self.measurements.index.duplicated(keep="last")
    self.measurements = self.measurements[~duplicates]

    self.measurements.sort_index(inplace=True)

    self.clean_measurements()
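# A self-contained illustration of the concatenate/deduplicate step at the
# end of get_measurements(), with made-up values standing in for two daily
# luftdaten.info files whose last and first rows overlap.
import pandas as pd

day1 = pd.DataFrame({"pm10": [21.0, 22.5]},
                    index=pd.to_datetime(["2018-06-01 23:00",
                                          "2018-06-01 23:30"], utc=True))
day2 = pd.DataFrame({"pm10": [22.7, 19.8]},
                    index=pd.to_datetime(["2018-06-01 23:30",
                                          "2018-06-02 00:00"], utc=True))
measurements = pd.concat([day1, day2])
duplicates = measurements.index.duplicated(keep="last")
measurements = measurements[~duplicates].sort_index()
print(measurements)  # three rows; the 23:30 value comes from day2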
def text(self, i, url, params):
    """Scrape and save the article body."""
    content = retrieve(url, params)
    tree = etree.HTML(content)
    t = tree.xpath("//div[@id='daily-cont']/*")
    _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
    self.data[i].append(_content)
def get_data(time_series, start_date, end_date, **retrieval_kwargs):
    """Retrieve time series data.

    Args:
        time_series: time series ID as listed in Metadata.time_series
        start_date: date string in ISO 8601 format. Interpreted as UTC.
        end_date: date string like start_date. If the current date or a
            future date is entered, end will be truncated so that only
            complete days are downloaded.
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Returns:
        Series of values, indexed by hourly periods

    Raises:
        ValueError if start_date is later than end_date
    """

    # Make start and end timezone aware and truncate time values
    query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                      utc=True).normalize()
    query_end_date = pd.to_datetime(end_date, format="%Y-%m-%d",
                                    utc=True).normalize()

    # Check validity of input and truncate end date if needed
    today = pd.to_datetime("today", utc=True)
    yesterday = today - pd.Timedelta(days=1)
    if query_end_date > yesterday:
        # TODO: Raise warning
        query_end_date = yesterday
        end_date = query_end_date.strftime("%Y-%m-%d")
    if query_start_date > query_end_date:
        raise ValueError("end_date must be greater than or equal to "
                         "start_date")

    # IRCELINE API takes local times. Convert start and end accordingly.
    query_start_dt = query_start_date.tz_convert("Europe/Brussels")
    query_start_dt_formatted = query_start_dt.strftime("%Y-%m-%dT%H")
    query_end_dt = query_end_date.tz_convert("Europe/Brussels")
    query_end_dt = query_end_dt - pd.Timedelta(1, "s")
    query_end_dt_formatted = query_end_dt.strftime("%Y-%m-%dT%H:%M:%S")

    url = DATA_URL_PATTERN.format(time_series_id=time_series,
                                  start=query_start_dt_formatted,
                                  end=query_end_dt_formatted)

    # TODO: Split response into days and cache as daily files. Also check
    #       cache day by day. Find longest missing intervals to make as few
    #       requests as possible.
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=time_series, start_date=start_date,
                        end_date=end_date))
    filepath = os.path.join(CACHE_DIR, filename)

    # TODO: Check day by day if data are cached

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])

    # Convert Unix timestamps to datetimes and then to periods for index
    timestamps = pd.to_datetime(data["timestamp"], unit="ms", utc=True)
    periods = timestamps.dt.to_period(freq="h")
    data = pd.Series(data["value"].values, index=periods, dtype="float")

    return data
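# A self-contained illustration of the timestamp handling at the end of
# get_data(): millisecond Unix timestamps are parsed as UTC datetimes and
# converted to hourly periods for the index. The two sample rows are made up.
import pandas as pd

raw = pd.DataFrame({"timestamp": [1530309600000, 1530313200000],
                    "value": [12.5, 14.0]})
timestamps = pd.to_datetime(raw["timestamp"], unit="ms", utc=True)
periods = timestamps.dt.to_period(freq="h")
series = pd.Series(raw["value"].values, index=periods, dtype="float")
print(series)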
def get_measurements(self, start_date, end_date, **retrieval_kwargs):
    """Retrieve time series data.

    Args:
        start_date: date string in ISO 8601 (YYYY-MM-DD) format.
            Interpreted as UTC.
        end_date: date string like start_date. If the current date or a
            future date is entered, end will be truncated so that only
            complete days are downloaded.
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Raises:
        ValueError if start_date is later than end_date
    """

    # Make start and end timezone aware and truncate time values
    query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                      utc=True).normalize()
    query_end_date = (pd.to_datetime(end_date, format="%Y-%m-%d",
                                     utc=True).normalize()
                      + pd.Timedelta(days=1))  # To include end_date data

    # Check validity of input and truncate end date if needed
    today = pd.to_datetime("today", utc=True)
    if query_end_date > today:
        warnings.warn("Resetting end_date to yesterday")
        yesterday = today - pd.Timedelta(days=1)
        end_date = yesterday.strftime("%Y-%m-%d")
        query_end_date = today  # 00:00, to include yesterday's data
    if query_start_date > query_end_date:
        raise ValueError("end_date must be greater than or equal to "
                         "start_date")

    # IRCELINE API takes local times. Convert start and end accordingly.
    query_start_local = query_start_date.tz_convert("Europe/Brussels")
    query_start_local_str = query_start_local.strftime("%Y-%m-%dT%H")
    query_end_local = query_end_date.tz_convert("Europe/Brussels")
    query_end_local -= pd.Timedelta(1, "s")
    query_end_local_str = query_end_local.strftime("%Y-%m-%dT%H:%M:%S")
    url = (API_ENDPOINTS["data pattern"]
           .format(time_series_id=self.sensor_id,
                   start=query_start_local_str,
                   end=query_end_local_str))

    # TODO: Split response into days and cache as daily files; check cache
    #       day by day. Find longest missing intervals to make as few
    #       requests as possible.
    filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                .format(time_series_id=self.sensor_id,
                        start_date=start_date, end_date=end_date))
    filepath = os.path.join(CACHE_DIR, filename)

    # TODO: Check day by day if data are cached

    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])
    if len(data) == 0:
        return
    data["value"] = data["value"].astype("float")
    data = data.rename(columns={"value": self.metadata["phenomenon"]})

    # Convert Unix timestamps to datetimes and then to periods for index
    data.index = (pd.to_datetime(data["timestamp"], unit="ms", utc=True)
                  .dt.to_period(freq="h"))
    data.index.name = "Period"
    data = data.drop(columns=["timestamp"])

    self.measurements = data
if not window2:
    window2 = make_win2()
    # Tells cprint which widget element to print the colored text in
    sg.cprint_set_output_destination(window2, '-WIN2 TEXT-')

# Clear output box
window2['-WIN2 TEXT-']('')

# Get user query
query = values['-QUERY-']

# Get the list of pdfs/txt_files user has chosen
files = values['-FILE LIST-']

output_sents_zipped, num_sents_found = retrieve(files, query, pdf_obj,
                                                txt_obj)

# Display the legend
for i, file in enumerate(files):
    # Create legend denoting which color corresponds to which document
    window2['-LEGEND-'].print(file, end='',
                              background_color=background_colors[i])
    window2['-LEGEND-'].print('\n', end='')

# Print the color-coded output sentences
print_output_sents(output_sents_zipped, files, background_colors, cprint)

# Print the number of sentences that were found
window2['-WIN2 NUM SENTS-'].update(
    f"Found {num_sents_found} sentences containing '{query}'.")

# For word2vec
type=str, help="result filename", required=True) return ret if __name__ == "__main__": ap = make_arg_parser() args = ap.parse_args() fst_repo_files = {} snd_repo_files = {} for target, source in zip((fst_repo_files, snd_repo_files), (args.repo_dir1, args.repo_dir2)): p = parser.Parser() strbuffer = utils.retrieve( os.path.join(source, "repodata", "repomd.xml")) parsed = p.parse_str(strbuffer.getvalue()) for data in parsed.get("repomd.data").objects: url = data.get("location.href") _type = data.get("type") target[_type] = os.path.join(source, url) missing_in_1 = set(fst_repo_files) - set(snd_repo_files) missing_in_2 = set(snd_repo_files) - set(fst_repo_files) print "missing types in 1 repo %s" % ",".join(missing_in_1) print "missing types in 2 repo %s" % ",".join(missing_in_2) common = set(fst_repo_files) & set(snd_repo_files) if not os.path.exists(args.destdir): os.mkdir(args.destdir) for _type in common: