def load_content(self):
    fll_f = self._location.get_fll_location()
    reader = CSVFileReader(fll_f)
    content = reader.get_content()
    for entry in content:
        entry['Date'] = datetime.strptime(
            "%s %s" % (entry['fll_day'], entry['year']), "%j %Y")
    self._content = content
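# Illustration of the "%j %Y" (day-of-year plus year) format parsed above:
#   datetime.strptime("185 2011", "%j %Y") -> datetime(2011, 7, 4)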
def _load_submodels(self, model_cache):
    """Loads the sub-model metadata from meta.csv in the model cache."""
    reader = CSVFileReader(os.path.join(model_cache, 'meta.csv'))
    content = reader.get_content()
    for entry in content:
        # strip the Python-list formatting that survived the round trip
        # through CSV, leaving space-separated variable names
        for ch in [',', "'", '[', ']']:
            entry['variables'] = entry['variables'].replace(ch, '')
        entry['variables'] = entry['variables'].split(' ')
        entry['year'] = int(entry['year'])
    return content
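# Example of the 'variables' clean-up above (variable names illustrative):
# a Python list serialised into the CSV comes back as a string and is
# split into names again:
#   "['stem_count', 'canopy_height']" -> ['stem_count', 'canopy_height']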
def parse_records(self, location, correct_records):
    content = CSVFileReader(location.get_harvest_dataf()).get_content()
    # write the parsed records back into the list in case on_read
    # returns new dicts rather than mutating in place
    for i, entry in enumerate(content):
        content[i] = formats.on_read(entry, self._location)
    # perform corrections and fixes
    if correct_records:
        for entry in content:
            # fix that annoying problem in BSBEC 2011 where they split
            # the harvest
            if location.get_name() == Location.BSBEC and \
                    entry[formats.DATE] == datetime(2011, 7, 19):
                entry[formats.DATE] = datetime(2011, 7, 4)
            # fix entries without sub-samples (2016 harvest: subsampling
            # was done only on the 12-plot harvest, as it is a more
            # accurate moisture measurement)
            if formats.DW_SUB not in entry:
                match = next(x for x in content
                             if x[formats.DATE] == entry[formats.DATE] and
                             x[formats.UID] == entry[formats.UID] and
                             x[formats.PSEUDO_REP] == 0)
                ratio = match[formats.DW_SUB] / match[formats.FW_SUB]
                entry[formats.DW_PLANT] = entry[formats.FW_PLANT] * ratio
    return content
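# Worked example of the sub-sample correction above, with hypothetical
# weights in grams: a record without a sub-sample borrows the dry/fresh
# ratio from the pseudo-rep-0 record of the same plot and date:
#   ratio = 120.0 / 400.0      # DW_SUB / FW_SUB = 0.3
#   dw_plant = 2000.0 * 0.3    # FW_PLANT * ratio = 600.0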
def load_file(self, fname, data):
    rows = CSVFileReader(fname).get_content()
    scenarios = []
    for row in rows:
        scenario = []
        keys = [x for x in row.keys()
                if x not in ['utility', 'rmse', 'cost']]
        for key in keys:
            entry = dict()
            if key in months:
                entry['month'] = key
            else:
                entry['variable'] = key
            entry['value'] = (row[key] == "True")
            scenario.append(entry)
        final_scenario = from_solution(scenario, data)
        final_scenario.set_rmse(float(row['rmse']))
        final_scenario.set_utility(float(row['utility']))
        scenarios.append(final_scenario)
    self._population = scenarios
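# Hypothetical CSV row and the solution entries built from it above
# (column names illustrative, and assuming 'June' appears in the
# module-level months list; 'rmse'/'utility'/'cost' are skipped):
#   row = {'June': 'True', 'stem_count': 'False',
#          'rmse': '1.2', 'utility': '3.4', 'cost': '0.5'}
#   -> [{'month': 'June', 'value': True},
#       {'variable': 'stem_count', 'value': False}]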
def read_directory(self, path):
    if self._format == SINGLE_FILE:
        data = CSVFileReader(os.path.join(
            path, "transmission.csv")).get_content()
        for i, entry in enumerate(data):
            entry[formats.UID] = entry['Plot']
            entry[formats.PSEUDO_REP] = 0
            entry[formats.MEASUREMENT] = 0
            entry[formats.DATE] = "%s %s" % (entry['Day'], entry['Year'])
            entry[formats.TIME] = None
            del entry['Plot']
            del entry['Year']
            del entry['Day']
            # write back in case on_read returns a new dict
            data[i] = formats.on_read(entry, self._location, "%j %Y")
        return data
    elif self._format == MULTIPLE_FILES:
        # read all files ending in TXT/txt in path
        files = os.listdir(path)
        r = re.compile(".*(TXT|txt)$")
        files = filter(r.match, files)
        data = []
        for f in files:
            data += self.read_txt(os.path.join(path, f))
        return data
def get_data(self):
    transmission_data = CSVFileReader(self._file_location).get_content()
    for entry in transmission_data:
        entry['Date'] = datetime.strptime(
            "%s %s" % (entry['Day'], entry['Year']), "%j %Y")
        entry['Day'] = int(entry['Day'])
        entry['Transmission'] = float(entry['Transmission'])
    return transmission_data
def _cache_load(self):
    cache_fname = os.path.join(os.environ.get("HOME"),
                               ".data_reader.cache")
    if os.path.isfile(cache_fname):
        self._combined_data = CSVFileReader(cache_fname).get_content()
        # write back in case on_read returns new dicts
        for i, entry in enumerate(self._combined_data):
            self._combined_data[i] = formats.on_read(entry, None)
        self._combined_data = self._fill_vals(self._combined_data)
        print("Loaded cache data from %s" % cache_fname)
        return True
    else:
        return False
def load_cm(self, fname):
    content = CSVFileReader(fname).get_content()
    # map method names to their bound implementations
    methods = {
        "CM1": self._cm1,
        "CM2": self._cm2,
        "CM3": self._cm3,
        "CM4": self._cm4,
        "CM5": self._cm5,
        "CM6": self._cm6,
    }
    for entry in content:
        entry['probability'] = float(entry['probability'])
        entry['success'] = float(entry['success'])
        if entry['method'] in methods:
            entry['function'] = methods[entry['method']]
    self._cm_functions = content
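# Usage sketch with a hypothetical loaded entry: after load_cm, each row
# carries its bound implementation and can be dispatched directly, e.g.
#   entry = {'method': 'CM2', 'probability': 0.4, 'success': 0.7,
#            'function': self._cm2}
#   entry['function'](...)  # calls _cm2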
def __init__(self, location, t_base):
    self._met_data = CSVFileReader(location.get_met_data()).get_content()
    self._t_base = t_base
    for reading in self._met_data:
        reading[formats.DATE] = datetime.strptime(reading['Date'],
                                                  "%d/%m/%Y")
        reading.pop('Date')
        reading[formats.PAR] = self.parse_float(reading[formats.PAR])
        reading[formats.T_MAX] = self.parse_float(reading[formats.T_MAX])
        reading[formats.T_MIN] = self.parse_float(reading[formats.T_MIN])
        reading[formats.RAINFALL] = \
            self.parse_float(reading[formats.RAINFALL])
        reading[formats.DD] = degree_days(reading[formats.T_MAX],
                                          reading[formats.T_MIN],
                                          self._t_base)
        for key in reading:
            if reading[key] == "NA":
                reading[key] = None
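# `degree_days` is defined elsewhere in this codebase. A minimal sketch of
# the common "average method" (an assumption - not necessarily the exact
# formulation used here):
def degree_days_sketch(t_max, t_min, t_base):
    if t_max is None or t_min is None:
        return None
    return max(0.0, (t_max + t_min) / 2.0 - t_base)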
def calc_RUE(self, LER_dict, k_dict, location, LAI):
    met_data = MetDataReaderCSV(location).get_met_data()
    fll_reader = FLLReader(location)
    genotypes = set([x['Genotype'] for x in LER_dict])
    destructive_phenos = CSVFileReader(
        location.get_destr_phenos()).get_content()
    for entry in destructive_phenos:
        entry['Date'] = datetime.strptime(entry['Date'],
                                          "%Y-%m-%d %H:%M:%S UTC")
        try:
            entry['fresh'] = float(
                entry['Fresh weight above ground material(g)'])
            entry['fresh_sub'] = float(
                entry['Fresh weight above ground sub-sample(g)'])
            entry['dry_sub'] = float(
                entry['dry weight above ground sub-sample(g)'])
        except ValueError:
            try:
                entry['dry_weight'] = float(
                    entry['dry weight above ground sub-sample(g)'])
            except ValueError:
                pass
            continue
        if entry['fresh_sub'] == 0.0:
            entry['dry_weight'] = entry['dry_sub']
            continue
        entry['dry_weight'] = entry['fresh'] * (entry['dry_sub'] /
                                                entry['fresh_sub'])
    destructive_phenos = [x for x in destructive_phenos
                          if 'dry_weight' in x]
    # run the simulation per genotype
    RUE = []
    for genotype in genotypes:
        geno_sub = [x for x in destructive_phenos
                    if x['Genotype'] == genotype]
        dates = list(set([x['Date'] for x in geno_sub]))
        dates.sort()
        # create data point groups by dates that are close
        # to each other or the same
        groups = []
        group_id = 0
        for date in dates:
            for group in groups:
                delta = group['Dates'][0] - date
                days = math.fabs(delta.days)
                if days and days < 20:
                    group['Dates'].append(date)
                    break
            else:
                # no nearby group found - create a new one
                group = {'id': group_id, 'Dates': [date]}
                groups.append(group)
                group_id += 1
        # get the mean dry weight per group
        mean_DW = []
        # add entry for fll day
        fll_date = fll_reader.get_genotype_fll(genotype)
        mean_DW.append({'Date': fll_date, 'Yield': 0.0})
        for group in groups:
            group_phenos = [x for x in geno_sub
                            if x['Date'] in group['Dates']]
            total_dw = 0.0
            for entry in group_phenos:
                total_dw += entry['dry_weight']
            total_dw /= float(len(group_phenos))
            # correct the group date to the first one in the group
            mean_DW.append({'Date': sorted(group['Dates'])[0],
                            'Yield': total_dw})
        # obtain genotype-specific coefficients
        LER = [x for x in LER_dict if x['Genotype'] == genotype]
        LER.sort(key=lambda x: x['stage'])
        k = [x for x in k_dict if x['Genotype'] == genotype]
        if len(k) > 1:
            k = next(x['k'] for x in k if x['Year'] == location.get_year())
        else:
            k = sorted(k, key=lambda x: x['Year'])[0]['k']
        # simulate PAR and record values for days of destructive harvests
        real_LAI = [x for x in LAI if x['Genotype'] == genotype]
        mean_DW = self.simulate_PAR(k, LER, met_data, fll_date, mean_DW,
                                    real_LAI)
        # finally work out what the RUE is from
        # the real DMY and simulated PAR values
        # (create and close the temp file so the path we write to exists)
        fd, temp_file = tempfile.mkstemp(suffix=genotype.split("-")[0])
        os.close(fd)
        CSVFileWriter(temp_file, mean_DW)
        robjects.r('''
            calc_RUE_r <- function(fname){
                data <- read.csv(fname)
                data$Yield <- data$Yield * 2
                fit <- lm(Yield ~ PAR + 0, data = data)
                return(summary(fit)$coefficients[1])
            }
        ''')
        calc_RUE_r = robjects.r("calc_RUE_r")
        RUE_val = calc_RUE_r(temp_file)[0]
        RUE.append({'Genotype': genotype, 'RUE': RUE_val})
    return RUE
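# The embedded R above fits a zero-intercept regression (Yield ~ PAR + 0,
# after doubling Yield); RUE is the fitted slope. A minimal pure-Python
# sketch of the same least-squares slope, for reference (function name is
# illustrative):
def zero_intercept_slope(par, yields):
    # for y = b * x with no intercept, b = sum(x * y) / sum(x * x)
    num = sum(p * y for p, y in zip(par, yields))
    den = sum(p * p for p in par)
    return num / den

# e.g. zero_intercept_slope([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) -> 2.0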
class ScatterPhenoAlgorithm:
    def __init__(self, data, scenario, root_dir, load=False):
        """data - {'cd_data': [], 'ml_data': []}
        scenario - 'simple_ml' or 'compound'
        """
        # hardcoded algorithm variables, could supply them to the
        # constructor if needed
        # self._PSize = 45 TODO real value
        self._PSize = 12
        # weight for the previous score entry, when updating the score table
        self._alpha = 0.3
        # self._b = 20 TODO real value
        self._b = 8
        self._proc_count = 4
        # set class variables
        self._variables = [formats.STEM_COUNT, formats.CANOPY_HEIGHT,
                           formats.TRANSMISSION, formats.FLOWERING_SCORE,
                           formats.LEAF_AREA_INDEX, formats.COL,
                           formats.ROW, formats.DD, formats.GENOTYPE,
                           formats.RAINFALL, formats.DOY, formats.PAR]
        self._variables.sort()
        self._scenario = scenario
        self._root_dir = root_dir
        if scenario == "simple_ml":
            self._methods = ['rf', 'knn', 'gbm']
        elif scenario == "compound":
            self._methods = ['NaiveMLProcessModelMemfix', 'GAWinModel']
        else:
            raise Exception("STUB")  # TODO
        self._data = self._hack_data(data)
        self._months = list(set([x[formats.DATE].strftime("%B")
                                 for x in self._data['ml_data']]))
        self._months.sort()
        # find maximum RMSE for methods
        self._max_rmse = self._get_max_rmse()
        # DB to contain all solutions ever explored
        self._database = ScatterPhenoScenarioContainer()
        self._score_table = self._empty_score_table()
        if load:
            sc_file = os.path.join(self._root_dir, 'score_table.csv')
            self._score_table = CSVFileReader(sc_file).get_content()
            for entry in self._score_table:
                entry['score'] = float(entry['score'])
                entry['value'] = (entry['value'] == "True")
            db_file = os.path.join(self._root_dir, 'database.csv')
            self._database.load_file(db_file, self._data)
            self._update_score_table()
            self._run_algorithm2()
        else:
            self._run_algorithm()

    def _run_algorithm2(self):
        """Algorithm main method (resuming from saved state)"""
        scenario_builder = SPScenarioBuilder(self._data, self._variables,
                                             process=True)
        scenario_builder.load_cm(os.path.join(self._root_dir,
                                              'cm_functions.csv'))
        # G1
        logging.info("G1")
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)
        # G2
        logging.info("G2")
        population = self._generator(population, scenario_builder.g2)
        # G3
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)
        # form ref set from database
        ref_set = self._ref_set_update(self._database)
        self._report(ref_set, scenario_builder)
        # Leaving this here in case I change my mind TODO
        # ref_set = ScatterPhenoScenarioContainer()
        # ref_set.load_file(os.path.join(self._root_dir, 'ref_set.csv'),
        #                   self._data)
        self._main_loop(ref_set, scenario_builder, population)

    def _run_algorithm(self):
        """Algorithm main method"""
        scenario_builder = SPScenarioBuilder(self._data, self._variables,
                                             process=True)
        # G1
        logging.info("G1")
        start = datetime.now()
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)
        logging.info("G1 - %s" % (datetime.now() - start))
        # G2
        logging.info("G2")
        start = datetime.now()
        population = self._generator(population, scenario_builder.g2)
        logging.info("G2 - %s" % (datetime.now() - start))
        # G3
        start = datetime.now()
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)
        logging.info("G3 - %s" % (datetime.now() - start))
        # build ref set
        logging.info("Building ref_set")
        ref_set = self._ref_set_update(population)
        self._report(ref_set, scenario_builder)
        # parallel improvement of the best b/2 solutions
        start = datetime.now()
        logging.info("Improving ref_set")
        ref_set = self._mp_improve(ref_set, scenario_builder)
        logging.info("Improvements - %s" % (datetime.now() - start))
        self._main_loop(ref_set, scenario_builder, population)

    def _main_loop(self, ref_set, scenario_builder, population):
        stop = False
        last_changed = 0
        iteration = 0
        while not stop:
            start_loop = datetime.now()
            # create the pool from combining solutions from ref_set
            logging.info("Performing combinations")
            start = datetime.now()
            pool = self._combine(ref_set, scenario_builder)
            logging.info("Combinations %s" % (datetime.now() - start))
            # improve pool
            logging.info("Improving best combinations")
            start = datetime.now()
            pool = self._mp_improve(pool, scenario_builder)
            logging.info("Improvements %s" % (datetime.now() - start))
            # join ref_set and pool together
            union = deepcopy(ref_set)
            union.add_container(pool)
            union.sort()
            new_ref_set = self._ref_set_update(union)
            if ref_set.same(new_ref_set):
                logging.info("Ref_set not changed")
                new_ref_set = ScatterPhenoScenarioContainer()
                for i in range(self._b // 2):
                    new_ref_set.add(union.get(i))
                # get the most diverse solutions to what
                # we already have in ref_set
                while new_ref_set.len() < self._b:
                    new_ref_set.add(population.get_diverse(new_ref_set))
            if ref_set.same(new_ref_set):
                last_changed += 1
            else:
                last_changed = 0
            if last_changed >= 5:
                logging.info("Reached optimal solution, terminating...")
                stop = True
            if os.path.exists('/home/eey9/.stop_scatter_search'):
                stop = True
                logging.info("Stopping because of file flag...")
            ref_set = new_ref_set
            iteration += 1
            logging.info("Completed iteration %d" % iteration)
            self._report(ref_set, scenario_builder)
            t_delta = datetime.now() - start_loop
            logging.info("Iteration time %s" % t_delta)
        return

    def _ref_set_update(self, source):
        source.sort()
        ref_set = ScatterPhenoScenarioContainer()
        for i in range(self._b // 2):
            ref_set.add(source.get(i))
        # get the most diverse solutions to what we already have in ref_set
        while ref_set.len() < self._b:
            ref_set.add(source.get_diverse(ref_set))
        return ref_set

    def _combine(self, container, scenario_builder):
        # build subsets
        combinations = self._build_combinations(container)
        pool = ScatterPhenoScenarioContainer()
        for combination in combinations:
            start = datetime.now()
            try:
                new_scenario = scenario_builder.combine(combination[0],
                                                        combination[1],
                                                        self._score_table)
            except NoValidSolutionException:
                logging.info("Combination %d/%d - %s: no valid solution" %
                             (combinations.index(combination) + 1,
                              len(combinations),
                              (datetime.now() - start)))
                continue
            self._update_score_table(new_scenario)
            if not pool.contains(new_scenario):
                pool.add(new_scenario)
            # see where the scenario qualifies to be in the container
            try:
                j = container.index(next(x for x in container.get_all()
                                         if new_scenario.get_utility() <
                                         x.get_utility()))
                scenario_builder.success(self._b - j)
            except StopIteration:
                # worse than anything in ref_set, does not qualify
                continue
            logging.info("Combination %d/%d - %s" %
                         (combinations.index(combination) + 1,
                          len(combinations), (datetime.now() - start)))
        return pool

    def _sp_improve(self, container, scenario_builder):
        container.sort()
        best = []
        for i in range(self._b // 2):
            best.append(container.get(i))
        result = []
        for scenario in best:
            result.append(self._improve(scenario, scenario_builder))
        for entry in result:
            index = container.index(entry['individual'])
            best = entry['improvements'].get(0)
            if best.get_utility() < entry['individual'].get_utility():
                container.replace(best, index)
            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _mp_improve(self, container, scenario_builder):
        """Improves b/2 best solutions from the container and updates
        the score table with the generated solutions
        """
        container.sort()
        pool = Pool(processes=self._proc_count)
        logging.info("Starting processes")
        start = datetime.now()
        best = []
        builders = []
        for i in range(self._b // 2):
            best.append(container.get(i))
            builders.append(scenario_builder)
        try:
            # pathos-style map over paired iterables
            result = pool.map(self._improve, best, builders)
            pool.close()
            pool.join()
        except MemoryError as e:
            send_email("I crashed again, please help!")
            import pudb
            pudb.set_trace()
            print(e)
        logging.info("Processes finished - %s" % (datetime.now() - start))
        # pathos was being smart and caching the pool, so this restart is
        # needed to prevent erroring out on reuse
        pool.restart()
        start = datetime.now()
        logging.info("mp_improve second loop")
        for entry in result:
            index = container.index(entry['individual'])
            best = entry['improvements'].get(0)
            if best.get_utility() < entry['individual'].get_utility():
                container.replace(best, index)
            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)
        logging.info("mp_improve second loop - %s" %
                     (datetime.now() - start))
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _improve(self, individual, scenario_builder):
        start = datetime.now()
        base = importr("base")
        candidate_list = self._build_candidate_list(individual)
        improvements = ScatterPhenoScenarioContainer()
        for var in candidate_list:
            new_scenario = scenario_builder.flip(individual, var)
            if new_scenario.same(individual) or \
                    not new_scenario.valid(process=True):
                continue
            new_scenario = self._evaluate(new_scenario, base)
            if not improvements.contains(new_scenario):
                improvements.add(new_scenario)
        for i in range(len(individual.get_solution())):
            for j in range(i + 1, len(individual.get_solution())):
                new_scenario = scenario_builder.swap(individual, i, j)
                if new_scenario.same(individual) or \
                        not new_scenario.valid(process=True):
                    continue
                new_scenario = self._evaluate(new_scenario, base)
                if not improvements.contains(new_scenario):
                    improvements.add(new_scenario)
                if not self._database.contains(new_scenario):
                    self._database.add(new_scenario)
        improvements.sort()
        logging.info("self._improve finished - %s" %
                     (datetime.now() - start))
        return {'individual': individual, 'improvements': improvements}

    def _build_candidate_list(self, individual):
        candidate_list = []
        for entry in individual.get_solution():
            t = next(x for x in entry.keys() if x != 'value')
            candidate_list.append(next(x for x in self._score_table
                                       if x['type'] == t and
                                       entry[t] == x['name'] and
                                       entry['value'] != x['value']))
        # smallest score = highest probability so DONT CHANGE THIS
        candidate_list.sort(key=lambda x: x['score'])
        return candidate_list

    def _generator(self, population, func):
        generated = 0
        while generated < self._PSize // 3:
            worked = False
            while not worked:
                try:
                    individual = func(self._score_table)
                    population.add(individual)
                    self._update_score_table(individual)
                    generated += 1
                    worked = True
                except ScatterPhenoScenarioContainerException:
                    # already exists
                    pass
        return population

    def _evaluate(self, pheno_scenario, base=None):
        if base is None:
            base = glob_base
        if self._database.contains(pheno_scenario):
            return self._database.request(pheno_scenario)
        model = ScatterPhenoModel(self._data, base, pheno_scenario,
                                  self._methods, self._max_rmse)
        util = (0.3 * pheno_scenario.get_cost()) + model.get_rmse()
        pheno_scenario.set_utility(util)
        pheno_scenario.set_rmse(model.get_absolute_rmse())
        return pheno_scenario

    def _update_score_table(self, input_=None):
        if input_ is None:
            self._score_table = self._calc_table(self._database.get_all())
        elif input_.__class__ == ScatterPhenoScenarioContainer:
            # we are given a population - this should happen after G1
            population = input_.get_all()
            for individual in population:
                if not self._database.contains(individual):
                    self._evaluate(individual)
                    self._database.add(individual)
                else:
                    logging.info("Warning, individual in database, weird!")
            # calculate score based on whole database population
            self._score_table = self._calc_table(self._database.get_all())
        elif input_.__class__ == ScatterPhenoScenario:
            individual = input_
            if self._database.contains(individual):
                # individual already in database, update the individual
                # with the rmse and utility
                index = self._database.index(individual)
                # but only if it needs updating
                if not individual.has_utility():
                    self._database.get(index).copy_to(individual)
                return
            if not individual.is_evaluated():
                self._evaluate(individual)
            # create the table at t
            new_table = self._calc_table(self._database.get_all() +
                                         [individual])
            # use the tables at t and t-1 to calculate the smoothed score
            for entry, new_entry in zip(self._score_table, new_table):
                entry['score'] = (self._alpha * entry['score'] +
                                  (1 - self._alpha) * new_entry['score'])
            self._database.add(individual)

    def _calc_table(self, population):
        # (a worked example of the score follows this class)
        new_table = []
        # using self._score_table just for a template here,
        # none of the values will be copied
        for entry in self._score_table:
            new_entry = deepcopy(entry)
            index = self._score_table.index(entry) // 2
            match = [x for x in population
                     if x.get(index) == entry['value']]
            non_match = [x for x in population
                         if x.get(index) != entry['value']]
            # little hack to get around certain variables not having
            # both values
            if len(match) == 0 or len(non_match) == 0:
                score = 0.5
            else:
                util_match = (sum([x.get_utility() for x in match]) /
                              len(match))
                util_non_match = (sum([x.get_utility() for x in non_match]) /
                                  len(non_match))
                score = util_match / (util_match + util_non_match)
            new_entry['score'] = score
            new_table.append(new_entry)
        return new_table

    def _get_max_rmse(self):
        rmses = dict()
        for method in self._methods:
            model = MLProcessModel(method, self._data, self._data,
                                   self._variables, glob_base)
            rmses[method] = model._rmse_abs
        return rmses

    def _build_combinations(self, container):
        container.sort()
        population = container.get_all()
        combinations = []
        for i in range(len(population)):
            for j in range(i + 1, len(population)):
                combination = [population[i], population[j]]
                combinations.append(combination)
        return combinations

    def _hack_data(self, data):
        # remove winter months that are useless
        for key in data.keys():
            new_data = [x for x in data[key]
                        if x[formats.DATE].month > 3 and
                        x[formats.DATE].month < 11]
            data[key] = new_data
        # remove records with missing values for needed variables
        needed_vars = deepcopy(self._variables)
        ml_data = []
        for entry in data['ml_data']:
            add = True
            for key in needed_vars:
                if entry[key] is None:
                    add = False
                    break
            if add:
                ml_data.append(entry)
        data['ml_data'] = ml_data
        return data

    def _empty_score_table(self):
        score_table = []
        for month in self._months:
            for val in [False, True]:
                entry = dict()
                entry['name'] = month
                entry['type'] = 'month'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)
        for var in self._variables:
            for val in [False, True]:
                entry = dict()
                entry['name'] = var
                entry['type'] = 'variable'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)
        return score_table

    def _report(self, ref_set, sb):
        self._to_csv(os.path.join(self._root_dir, 'ref_set.csv'), ref_set)
        CSVFileWriter(os.path.join(self._root_dir, 'score_table.csv'),
                      self._score_table)
        self._to_csv(os.path.join(self._root_dir, 'database.csv'),
                     self._database)
        # delete the function reference from the dict and save the info
        # to file
        func = deepcopy(sb._cm_functions)
        for f in func:
            del f['function']
        CSVFileWriter(os.path.join(self._root_dir, 'cm_functions.csv'),
                      func)
        # report memory usage
        mem_report = [{'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
                       'mem': memory() / float(1024 ** 2),
                       'res_memory': res_memory(),
                       'total_memory': total_memory()}]
        memfile = os.path.join(self._root_dir, 'memfile.csv')
        if not os.path.exists(memfile):
            CSVFileWriter(memfile, mem_report)
        else:
            CSVFileWriter(memfile, mem_report, write_mode='a')

    def _to_csv(self, fname, container):
        data = [x.to_dict() for x in container.get_all()]
        CSVFileWriter(fname, data)
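# Worked example of the score used in _calc_table above, with hypothetical
# utilities: if solutions matching a score-table entry average utility 2.0
# and non-matching ones average 6.0, then
#   score = 2.0 / (2.0 + 6.0) = 0.25
# Lower utility is better, so a low score marks a promising value; this is
# why _build_candidate_list sorts ascending ("smallest score = highest
# probability"). Per-scenario updates blend the new table with the old one
# using exponential smoothing: score = alpha * old + (1 - alpha) * new.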
def parse_file(self, filename):
    content = CSVFileReader(filename).get_content()
    content = self.parse_content(content)
    return content