def dump_data(config):
    train_raw, dev_raw = get_raw_data(config)
    dev_data = mfcc_all(dev_raw, config)
    with open(os.path.join(config['mfcc']['data_path'], config['data']['dev_file']), 'wb') as f:
        pickle.dump(dev_data, f)
    train_data = mfcc_all(train_raw, config)
    with open(os.path.join(config['mfcc']['data_path'], config['data']['train_file']), 'wb') as f:
        pickle.dump(train_data, f)
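# Illustrative only: dump_data reads nested keys from its config argument. The
# paths and filenames below are assumptions for demonstration, not values taken
# from the original project.
example_config = {
    'mfcc': {'data_path': './features'},
    'data': {'train_file': 'train_mfcc.pkl', 'dev_file': 'dev_mfcc.pkl'},
}
# dump_data(example_config)  # would pickle dev and train MFCC features under ./features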
def import_script(script_name=None, level_name=None):
    # get name of script file
    if not script_name:  # if script_name not defined, prompt for it
        while True:
            print "Please enter the name of the script file."
            print "The current working directory is "
            print
            print os.getcwd()
            script_name = raw_input("Path to .6vscript file: ")
            if not script_name:
                print "You must specify a script to import."
                continue
            else:
                try:
                    with open(script_name):
                        pass
                except IOError:
                    print 'File not found.'
                else:
                    break
        print

    # Checks whether level_name was specified beforehand (for quiet execution)
    while not level_name:
        print "Please enter the filename of the level"
        print "(do not include .vvvvvv or else bad things will happen)"
        level_name = utils.get_level_name()
        if not level_name:
            print "You must enter a level name"

    # back up the level file
    print "Backing up level file..."
    backup_file = utils.level_backup(level_name)
    print "Backup saved to " + backup_file

    # get raw level data from file
    level_data = utils.get_raw_data(utils.get_vvvvvv_dir(), level_name)

    # get raw script data from file
    raw_script_data = utils.get_script_filedata(script_name)

    # convert script data to raw data
    script_data = utils.script_to_raw(raw_script_data)
    if not script_data:
        raise IOError

    # add the script data to the level data in memory
    utils.import_script_data(level_data, script_data)

    # going hot!
    success = utils.write_level_data(utils.get_vvvvvv_dir(), level_name, level_data)
    if success:
        print "File successfully written."
    else:
        print "An error occurred when writing the file."
def run():
    confirmed_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    deaths_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    recovered_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

    confirmed_total_data = get_raw_data(confirmed_CSV_URL)
    deaths_total_data = get_raw_data(deaths_CSV_URL)
    recovered_total_data = get_raw_data(recovered_CSV_URL)

    # Extract each Korea series from its matching source
    korea_confirmed, date = extract_data(confirmed_total_data, return_dates=True)
    korea_deaths = extract_data(deaths_total_data)
    korea_recovered = extract_data(recovered_total_data)

    korea_data = list(zip(date, korea_confirmed, korea_deaths, korea_recovered))
    result = build_result(korea_data)

    save_dir = './data/koreaRegionalCumulativeData.js'
    crawler_name = 'crawlKoreaRegionalCumulativeData.py'
    var_name = 'koreaRegionalCumulativeData'
    write_data(result, save_dir, crawler_name, var_name)
def data_loader(args):
    # Raw data: lists of sentences and their category labels
    train_data, train_labels = utils.get_raw_data(args.train_file)
    val_data, val_labels = utils.get_raw_data(args.dev_file)

    args.catogories = ['EnterSports', 'Military', 'Economics', 'Technology', 'Government']
    args.cat_dict = dict(zip(args.catogories, range(len(args.catogories))))

    word_vocab, num_total_words = utils.build_dict(train_data)
    trainlabels_to_idx = [args.cat_dict[label] for label in train_labels]
    vallabels_to_idx = [args.cat_dict[label] for label in val_labels]

    train_data, train_labels = utils.encode(train_data, trainlabels_to_idx, word_vocab)
    val_data, val_labels = utils.encode(val_data, vallabels_to_idx, word_vocab)

    train_data = utils.pad_features(train_data, max_len=args.max_features)
    val_data = utils.pad_features(val_data, max_len=args.max_features)

    train_set = utils.batch(train_data.copy(), train_labels.copy(), args.batch_size)
    val_set = utils.batch(val_data.copy(), val_labels.copy(), args.batch_size)
    return train_set, val_set, num_total_words
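# Hypothetical invocation sketch for data_loader. The attribute names mirror what
# the function accesses above (train_file, dev_file, max_features, batch_size);
# the file paths and values here are illustrative assumptions.
from types import SimpleNamespace

example_args = SimpleNamespace(
    train_file='data/train.txt',
    dev_file='data/dev.txt',
    max_features=200,
    batch_size=64,
)
# train_set, val_set, num_total_words = data_loader(example_args)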
def run():
    confirmed_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    deaths_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    recovered_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

    confirmed_total_data = get_raw_data(confirmed_CSV_URL)
    deaths_total_data = get_raw_data(deaths_CSV_URL)
    recovered_total_data = get_raw_data(recovered_CSV_URL)

    total_data, num_date = build_total_data(confirmed_total_data)
    countries = [
        'US', 'United Kingdom', 'Australia', 'Canada', 'China',
        'Congo', 'Denmark', 'France', 'Netherlands'
    ]
    final_data = concat_countries_data(total_data, num_date, countries)

    save_dir = './data/HopkinsCoronaWorldData.js'
    crawler_name = 'Hopkins_world_data_parser.py'
    var_name = 'hopkinsData'
    write_data(final_data, save_dir, crawler_name, var_name)
def fetch_osm_pumps(path, outpath):
    create_folder(path)
    # Overpass query (URL-encoded below):
    # (area["ISO3166-2"="DE-BE"][admin_level=4];)->.searchArea;
    # (node["man_made"="water_well"]["network"="Berliner Straßenbrunnen"](area.searchArea););
    query_string = "http://overpass-api.de/api/interpreter?data=%5Bout%3Ajson%5D%3B%28area%5B%22ISO3166%2D2%22%3D%22DE%2DBE%22%5D%5B%22admin%5Flevel%22%3D%224%22%5D%3B%29%2D%3E%2EsearchArea%3B%28node%5B%22man%5Fmade%22%3D%22water%5Fwell%22%5D%5B%22network%22%3D%22Berliner%20Straßenbrunnen%22%5D%28area%2EsearchArea%29%3B%29%3Bout%3B%3E%3Bout;"
    # get data and parse the JSON payload
    raw_data = get_raw_data(query_string)
    json_data = raw_data.json()
    # transform into a GeoDataFrame, clean it, and write to JSON
    gdf = get_overpass_gdf(json_data)
    cleaned_gdf = transform_dataframe(gdf)
    write_df_to_json(cleaned_gdf, outpath)
def main():
    model_type = "lda"
    exp_name = "PCA/scaled/all_samples"
    if exp_name.split("/")[1] == "scaled":
        scaled = True
    else:
        scaled = False

    all_sample_results = np.zeros((21, 50))
    for i, sample in enumerate(range(1, 22)):
        print("sample {}".format(sample))
        if exp_name == "raw/all_samples":
            x_train, y_train = get_raw_data(sample, scale=scaled)
        else:
            epochs = get_epochs(sample, scale=scaled)
            reduced_data = pca(80, epochs, plot=False)
            x_train = reduced_data.transpose(0, 2, 1).reshape(-1, reduced_data.shape[1])
            y_train = get_y_train(sample)

        results = linear_models(x_train, y_train, model_type=model_type)
        all_sample_results[i] = results

        sns.set()
        ax = sns.lineplot(data=results, dashes=False)
        ax.set(ylim=(0, 0.6), xlabel='Time', ylabel='Accuracy',
               title='Cross Val Accuracy {} for sample {}'.format(model_type, sample))
        plt.axvline(x=15, color='b', linestyle='--')
        ax.figure.savefig("Results/{}/{}/sample{}".format(model_type, exp_name, sample), dpi=300)
        # plt.show()
        plt.clf()

    all_results_df = pd.DataFrame(all_sample_results)
    all_results_df.to_csv("Results/{}/{}/all_sample_results.csv".format(model_type, exp_name))

    average_results = np.mean(all_sample_results, axis=0)
    sns.set()
    ax = sns.lineplot(data=average_results, dashes=False)
    ax.set(ylim=(0, 0.6), xlabel='Time', ylabel='Accuracy',
           title='Average Cross Val Accuracy {} across all samples'.format(model_type))
    plt.axvline(x=15, color='b', linestyle='--')
    ax.figure.savefig("Results/{}/{}/average_all_samples".format(model_type, exp_name), dpi=300)
    # plt.show()
    plt.clf()
                    help='batch normalization')
parser.set_defaults(shuffle=False)
args = parser.parse_args()
print args

assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + '%s.mh%d.n%d.bs%d%s%s%s.babi%s' % (
    args.network,
    args.memory_hops,
    args.dim,
    args.batch_size,
    ".na" if args.normalize_attention else "",
    ".bn" if args.batch_norm else "",
    (".d" + str(args.dropout)) if args.dropout > 0 else "",
    args.input_train.split("/")[-1])

# Go and get the data from the folders; see utils class.
train_raw, test_raw = utils.get_raw_data(args.input_train, args.input_test)

# Initialize word2vec with utils.load_glove
word2vec = utils.load_glove(args.word_vector_size)

args_dict = dict(args._get_kwargs())
args_dict['train_raw'] = train_raw
args_dict['test_raw'] = test_raw
args_dict['word2vec'] = word2vec

# init class
if args.network == 'dmn_batch':
    import dmn_batch
    dmn = dmn_batch.DMN_batch(**args_dict)
    # The basic module is implemented for document similarity
import os

import numpy as np
import pdf2image
from pathlib import Path

from utils import draw_boxes
from utils import get_raw_data, merge_blocks, create_order, get_blocks, remove_empty

if __name__ == '__main__':
    pdf_dir = "/home/mahad/abbyy_dummy_dataset/pdf"
    xml_dir = "/home/mahad/abbyy_dummy_dataset/xml"
    save_dir = "/tmp"

    pdf_files = os.listdir(pdf_dir)
    xml_files = os.listdir(xml_dir)

    for xml_file in xml_files:
        print(xml_file)
        xml_path = os.path.join(xml_dir, xml_file)
        pdf_path = os.path.join(pdf_dir, Path(xml_file).stem + ".pdf")
        xml_data = get_raw_data(xml_path)

        for page in xml_data:
            para_boxes = page["para_boxes"]
            para_texts = page["para_texts"]
            para_boxes, para_texts = remove_empty(para_boxes, para_texts)

            tables = page["tables"]
            table_boxes = [tt["bbox"] for tt in tables]
            table_texts = [tt["rows"] for tt in tables]

            img = pdf2image.convert_from_path(pdf_path,
                                              size=(page["width"], page["height"]),
                                              first_page=page["page_number"],
                                              last_page=page["page_number"])
            img = np.asarray(img[0])

            all_boxes = para_boxes + table_boxes
            all_texts = para_texts + table_texts

            column_blocks = get_blocks((page["height"], page["width"]), all_boxes)
            column_blocks_merged = merge_blocks(column_blocks, all_boxes)
            ordered_boxes = create_order(column_blocks_merged, all_boxes)
assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + "%s.mh%d.n%d.bs%d%s%s%s.babi%s" % (
    args.network,
    args.memory_hops,
    args.dim,
    args.batch_size,
    ".na" if args.normalize_attention else "",
    ".bn" if args.batch_norm else "",
    (".d" + str(args.dropout)) if args.dropout > 0 else "",
    args.input_train.split("/")[-1],
)

# Go and get the data from the folders; see utils class.
train_raw, test_raw = utils.get_raw_data(args.input_train, args.input_test)

# Initialize word2vec with utils.load_glove
word2vec = utils.load_glove(args.word_vector_size)

args_dict = dict(args._get_kwargs())
args_dict["train_raw"] = train_raw
args_dict["test_raw"] = test_raw
args_dict["word2vec"] = word2vec

# init class
if args.network == "dmn_batch":
    import dmn_batch
    dmn = dmn_batch.DMN_batch(**args_dict)
from lxml import etree

from utils import get_raw_data


def retrieve_style_text(xml_file, style):
    xml_tree = etree.parse(xml_file)
    text = []
    locs = []
    for elem in xml_tree.iter():
        if elem.tag.count('charParams') > 0 and elem.attrib["style"] == style:
            text.append(elem.text)
            locs.append([elem.attrib["l"], elem.attrib["t"], elem.attrib["r"], elem.attrib["b"]])
    return text, locs


if __name__ == '__main__':
    xml_file = "/home/mahad/abbyy_dummy_dataset/xml/Original Doc_Alpha FDI Holdings Pte. Ltd. (1).xml"
    results = get_raw_data(xml_file)
# -*- coding: utf-8 -*-
import regex as re
import json
import os
from collections import OrderedDict

from utils import get_raw_data

os.chdir(os.path.dirname(os.path.abspath(__file__)))
get_raw_data()

# Languages with insufficient translation data are excluded
avoid_languages = ['cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo']


def _get_language_locale_dict():
    cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
    available_locale_names = os.listdir(cldr_dates_full_dir)
    available_language_names = [shortname for shortname in available_locale_names
                                if not re.search(r'-[A-Z0-9]+$', shortname)]
    available_language_names.remove('root')
    language_locale_dict = {}
    for language_name in available_language_names:
        language_locale_dict[language_name] = []
        for locale_name in available_locale_names:
            if re.match(language_name + '-[A-Z0-9]+$', locale_name):
                language_locale_dict[language_name].append(locale_name)

    for language in avoid_languages:
        del language_locale_dict[language]
    return language_locale_dict
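# For illustration (example values, not an exhaustive CLDR listing): the mapping
# built above groups locale variants under their base language, e.g.
#     {'en': ['en-AU', 'en-GB', 'en-US'], 'de': ['de-AT', 'de-CH'], ...}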
def extract(level_name=None, save_file=None):
    # Initializing variables
    filedata = ""
    script_data = None
    vvvvvv_dir = None

    # Get current opsys
    vvvvvv_dir = utils.get_vvvvvv_dir()

    # Checks whether level_name was specified beforehand (for quiet execution)
    if not level_name:
        # request filename from user
        while True:
            level_name = None
            level_name = utils.get_level_name()
            if not level_name:
                print "You must enter a level name"
                continue
            # get level data
            raw_data = utils.get_raw_data(vvvvvv_dir, level_name)
            if not raw_data:
                print "Error: level does not exist"
                continue
            else:
                break
    else:
        raw_data = utils.get_raw_data(vvvvvv_dir, level_name)

    # get script data
    script_data = utils.get_script_data(raw_data)
    if not script_data:
        print "No script found"
        quit()
    final_data = utils.cleanup_data(script_data)
    print "Done!"

    # checks if save_file was specified beforehand (for quiet execution)
    if not save_file:
        cwd = os.getcwd()
        print
        print "What file do you wish me to save the data to?"
        print "Current working directory is: "
        print
        print cwd
        print
        print "You may enter a filename to save in current directory,"
        print "enter a relative path, or a full path."
        print
        print "Else, press return to accept the default, which is: "
        print
        print level_name + ".6vscript"
        print
        save_file = raw_input("Save file: ")
        if not save_file:
            save_file = level_name + ".6vscript"
    else:
        pass

    with open(save_file, 'w') as outfile:
        for line in final_data:
            outfile.write(line + '\n')
    print save_file + " written"
def test_get_raw_data(query_fixture):
    response = get_raw_data(query_fixture)
    assert isinstance(response, Response)
    assert response.ok
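# get_raw_data itself is not shown in these snippets. A minimal sketch consistent
# with the test above, assuming it is a thin wrapper around requests.get that
# returns the raw requests.Response for a query URL (name suffixed to mark it as
# a hypothetical stand-in, not the project's implementation):
import requests


def get_raw_data_sketch(query_string):
    # Fetch the query URL and hand back the unparsed HTTP response.
    return requests.get(query_string)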
def getData(building, zone, date):
    """Whatever data we get should be stored. date: in PST"""
    root, dirs, files = os.walk("CacheThanos/").next()
    Flag = False
    for index, thefile in enumerate(files, start=1):
        if str(building) + str(zone) + str(date) + ".dat" == thefile:
            Flag = True

    if Flag == False:
        # get config
        cfg = utils.get_config(building)
        zone_cfg = utils.get_zone_config(building, zone)

        events = []
        zone_log = utils.get_zone_formalog(building, zone)
        if zone_log:
            for line in zone_log:
                dateLog = utils.get_mdal_string_to_datetime(line.split(" : ")[0])
                dateLog = dateLog.astimezone(pytz.timezone("US/Pacific"))
                if dateLog.date() == date.date():
                    events.append((int((dateLog.replace(tzinfo=None) -
                                        date.replace(tzinfo=None)).total_seconds() / 60),
                                   line.split(" : ")[1]))

        interval = cfg["Interval_Length"]
        # client = utils.choose_client(cfg)
        client = get_client()

        start = date.replace(hour=0, minute=0, second=0)
        end = date.replace(day=date.day + 1, hour=0, minute=0, second=0)

        # Generate UTC times. Use UTC for any archiver getting methods.
        pst_pytz = pytz.timezone("US/Pacific")
        start_pst = pst_pytz.localize(start)
        start_utc = start_pst.astimezone(pytz.timezone("UTC"))
        end_pst = pst_pytz.localize(end)
        end_utc = end_pst.astimezone(pytz.timezone("UTC"))

        datamanager = DataManager(cfg, zone_cfg, client, zone, now=start_utc)

        # get setpoints
        ground_truth_setpoints_df = datamanager.thermostat_setpoints(start_utc, end_utc)[zone]  # from archiver
        ground_truth_setpoints_df.index = ground_truth_setpoints_df.index.tz_convert(pst_pytz)
        config_setpoints_df = datamanager.better_comfortband(start)
        safety_setpoints_df = datamanager.better_safety(start)
        config_setpoints = config_setpoints_df[["t_low", "t_high"]].values
        safety_setpoints = safety_setpoints_df[["t_low", "t_high"]].values

        # Get tstat and weather data
        thermal_data_manager = ThermalDataManager(cfg, client)
        inside_data, outside_data = utils.get_raw_data(building=building, client=client, cfg=cfg,
                                                       start=start_utc, end=end_utc, force_reload=True)
        zone_inside_data = inside_data[zone]
        zone_inside_data.index = zone_inside_data.index.tz_convert(pst_pytz)
        outside_data = thermal_data_manager._preprocess_outside_data(outside_data.values())
        outside_data.index = outside_data.index.tz_convert(pst_pytz)
        outside_data = outside_data.resample("1T").interpolate()

        Tin = zone_inside_data["t_in"].values
        if np.isnan(Tin).any():
            print "Warning: Tin contains NaN. Estimates are based on interpolations"
            nans, x = nan_helper(Tin)
            Tin[nans] = np.interp(x(nans), x(~nans), Tin[~nans])

        # TODO: hack. Put the raw data into a DataFrame full of NaN, then interpolate
        # to get data for the whole day.
        Tout = pd.DataFrame(columns=["t_out"], index=pd.date_range(start=start, end=end, freq="1T"))
        Tout.index = Tout.index.tz_localize(pst_pytz)
        Tout["t_out"][outside_data.index[0]:outside_data.index[-1]] = outside_data["t_out"]
        Tout = Tout.ffill()["t_out"].values[:1440]

        Policy = zone_inside_data["action"].values

        # Prepare discomfort
        discomfortManager = Discomfort(setpoints=config_setpoints)

        # get occupancies
        occupancy_config = datamanager.better_occupancy_config(start)
        try:
            occupancy_ground = datamanager.occupancy_archiver(start=start, end=end)
        except:
            if zone_cfg["Advise"]["Occupancy_Sensors"] == True:
                print("Warning, could not get ground truth occupancy.")
            occupancy_ground = None
        if occupancy_ground is None:
            occupancy_use = occupancy_config
        else:
            occupancy_use = occupancy_ground
        occupancy_use = occupancy_use["occ"].values

        discomfort = []
        for i in range(len(Tin)):
            # for the ith minute
            print len(Tin), len(occupancy_use)
            assert len(Tin) <= len(occupancy_use)
            tin = Tin[i]
            occ = occupancy_use[i]
            discomfort.append(discomfortManager.disc(t_in=tin, occ=occ, node_time=i, interval=1))

        # get consumption, cost, and prices
        prices = datamanager.better_prices(start).values
        heating_consumption = zone_cfg["Advise"]["Heating_Consumption"]
        cooling_consumption = zone_cfg["Advise"]["Cooling_Consumption"]
        energy_manager = EnergyConsumption(prices, interval, now=None,
                                           heat=heating_consumption, cool=cooling_consumption)
        cost = []
        for i in range(len(Policy)):
            # see it as the ith minute. That's why we need the assert
            assert len(Policy) <= len(prices)
            action = Policy[i]
            cost.append(energy_manager.calc_cost(action=action, time=i))
        cost = np.array(cost)

        # Cache the data and check if already downloaded!
        OPs = occupancy_use[:1440]
        TinsUPComfortBand = config_setpoints_df["t_high"][:1440]
        TinsDOWNComfortBand = config_setpoints_df["t_low"][:1440]
        TinsUPSafety = safety_setpoints_df["t_high"][:1440]
        TinsDOWNSafety = safety_setpoints_df["t_low"][:1440]
        TinsUPsp = ground_truth_setpoints_df["t_high"][:1440]
        TinsDOWNsp = ground_truth_setpoints_df["t_low"][:1440]
        Costs = cost[:1440]
        Prices = prices[:1440]
        Discomforts = discomfort[:1440]

        temp = (OPs, Tin, Tout, Policy, TinsUPComfortBand, TinsDOWNComfortBand,
                TinsUPSafety, TinsDOWNSafety, TinsUPsp, TinsDOWNsp,
                Costs, Prices, Discomforts, events, building, zone, date)
        pickle.dump(temp,
                    open("CacheThanos/" + str(building) + str(zone) + str(date) + ".dat", "wb"))
        return temp
    else:
        return pickle.load(
            open("CacheThanos/" + str(building) + str(zone) + str(date) + ".dat", "rb"))
def get_data(filepath):
    raw_data = get_raw_data(filepath)
    preprocessed_data = preprocess_data(raw_data)
    row_length = len(preprocessed_data[0])
    data = df(preprocessed_data)
    return data, row_length
    u'\N{REVERSED PRIME}',  # u'\u2035'
    u'\N{MODIFIER LETTER PRIME}',  # u'\u02b9'
    u'\N{FULLWIDTH APOSTROPHE}',  # u'\uff07'
]

DATE_ORDER_PATTERN = re.compile(u'([DMY])+\u200f*[-/. \t]*([DMY])+\u200f*[-/. \t]*([DMY])+')
RELATIVE_PATTERN = re.compile(r'(?<![\+\-]\s*)\{0\}')
DEFAULT_MONTH_PATTERN = re.compile(r'^M?\d+$', re.U)
RE_SANITIZE_APOSTROPHE = re.compile(u'|'.join(APOSTROPHE_LOOK_ALIKE_CHARS))
AM_PATTERN = re.compile(r'^\s*[Aa]\s*\.?\s*[Mm]\s*\.?\s*$')
PM_PATTERN = re.compile(r'^\s*[Pp]\s*\.?\s*[Mm]\s*\.?\s*$')
PARENTHESIS_PATTERN = re.compile(r'[\(\)]')

os.chdir(os.path.dirname(os.path.abspath(__file__)))
get_raw_data()
cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"


def _filter_relative_string(relative_string):
    return (isinstance(relative_string, six.string_types) and
            RELATIVE_PATTERN.search(relative_string) and
            not PARENTHESIS_PATTERN.search(relative_string))


def _filter_month_name(month_name):
    return not DEFAULT_MONTH_PATTERN.match(month_name)


def _retrieve_locale_data(locale):
def get_data(self, filepath):
    raw_data = get_raw_data(filepath)
    preprocessed_data = preprocess_data(raw_data)
    self.row_length = len(preprocessed_data[0])
    self.data = df(preprocessed_data)