예제 #1
0
    def load_content(self):
        """Load the FLL CSV for this object's location into ``self._content``.

        Every row is given a ``'Date'`` entry built from its ``'fll_day'``
        and ``'year'`` columns, parsed as day-of-year plus four-digit year.
        """
        rows = CSVFileReader(self._location.get_fll_location()).get_content()
        for row in rows:
            day_and_year = "%s %s" % (row['fll_day'], row['year'])
            row['Date'] = datetime.strptime(day_and_year, "%j %Y")

        self._content = rows
예제 #2
0
    def _load_submodels(self, model_cache):
        """Read ``meta.csv`` from ``model_cache`` and normalise its rows.

        The ``'variables'`` column has every comma, single quote and square
        bracket removed and is then split on single spaces into a list;
        ``'year'`` is converted to ``int``.  The rows are returned.
        """
        meta_path = os.path.join(model_cache, 'meta.csv')
        rows = CSVFileReader(meta_path).get_content()

        unwanted = (',', "'", '[', ']')
        for row in rows:
            cleaned = row['variables']
            for symbol in unwanted:
                cleaned = cleaned.replace(symbol, '')

            row['variables'] = cleaned.split(' ')
            row['year'] = int(row['year'])

        return rows
예제 #3
0
    def parse_records(self, location, correct_records):
        """Read the harvest CSV for ``location`` and optionally patch it.

        Every row is passed through ``formats.on_read`` together with
        ``self._location``.  When ``correct_records`` is true, two
        corrections are applied to each row:

        * BSBEC rows dated 2011-07-19 are re-dated to 2011-07-04;
        * rows lacking a ``formats.DW_SUB`` key get ``formats.DW_PLANT``
          computed from the dry/fresh sub-sample ratio of the row that has
          the same date and UID and pseudo-rep 0.

        Returns the rows.  ``StopIteration`` escapes if a row needs a
        pseudo-rep 0 partner and none exists.
        """
        records = CSVFileReader(location.get_harvest_dataf()).get_content()
        for record in records:
            # NOTE(review): the return value is not stored back, so this
            # relies on on_read updating the row in place -- confirm.
            formats.on_read(record, self._location)

        if not correct_records:
            return records

        split_day = datetime(2011, 7, 19)
        for record in records:
            # The BSBEC 2011 harvest was split over two days; fold the
            # later day into the earlier one.
            is_bsbec = location.get_name() == Location.BSBEC
            if is_bsbec and record[formats.DATE] == split_day:
                record[formats.DATE] = datetime(2011, 7, 4)

            if formats.DW_SUB in record.keys():
                continue

            # No sub-sample on this row (2016 harvest: sub-sampling was done
            # only on the 12 plot harvest, as it is a more accurate moisture
            # measurement), so borrow the ratio from the pseudo-rep 0 row.
            reference = next(
                other for other in records
                if other[formats.DATE] == record[formats.DATE]
                and other[formats.UID] == record[formats.UID]
                and other[formats.PSEUDO_REP] == 0)

            dry_fraction = reference[formats.DW_SUB] / reference[formats.FW_SUB]
            record[formats.DW_PLANT] = record[formats.FW_PLANT] * dry_fraction

        return records
    def load_file(self, fname, data):
        """Rebuild the population from a CSV file.

        Each row becomes one scenario.  Every column other than
        ``'utility'``, ``'rmse'`` and ``'cost'`` turns into a solution entry
        keyed ``'month'`` (when the column name is in ``months``) or
        ``'variable'`` (otherwise), with ``'value'`` true only for the
        literal text ``"True"``.  The scenario is created with
        ``from_solution`` and given the row's rmse (as read) and utility
        (as ``float``).  The result replaces ``self._population``.
        """
        excluded = ['utility', 'rmse', 'cost']
        population = []
        for row in CSVFileReader(fname).get_content():
            solution = []
            for column in [c for c in row.keys() if c not in excluded]:
                kind = 'month' if column in months else 'variable'
                solution.append({kind: column,
                                 'value': (row[column] == "True")})

            restored = from_solution(solution, data)
            restored.set_rmse(row['rmse'])
            restored.set_utility(float(row['utility']))
            population.append(restored)

        self._population = population
예제 #5
0
    def read_directory(self, path):
        """Load transmission records from ``path`` as ``self._format`` says.

        ``SINGLE_FILE``: read ``transmission.csv``, fill the standard
        ``formats`` keys on each row, drop the raw ``'Plot'``, ``'Year'``
        and ``'Day'`` columns and hand the row to ``formats.on_read`` with
        the ``"%j %Y"`` date format.

        ``MULTIPLE_FILES``: run ``self.read_txt`` on every directory entry
        whose name ends in ``TXT`` or ``txt`` and concatenate the results.

        Any other format falls through and returns ``None``.
        """
        if self._format == SINGLE_FILE:
            csv_path = os.path.join(path, "transmission.csv")
            rows = CSVFileReader(csv_path).get_content()

            for row in rows:
                row[formats.UID] = row['Plot']
                row[formats.PSEUDO_REP] = 0
                row[formats.MEASUREMENT] = 0
                row[formats.DATE] = "%s %s" % (row['Day'], row['Year'])
                row[formats.TIME] = None

                for raw_column in ('Plot', 'Year', 'Day'):
                    del row[raw_column]

                # NOTE(review): result is not stored back; presumably
                # on_read converts the row in place -- confirm.
                formats.on_read(row, self._location, "%j %Y")

            return rows

        if self._format == MULTIPLE_FILES:
            # read all txt files in path
            txt_name = re.compile(".*(TXT|txt)$")
            collected = []
            for name in os.listdir(path):
                if txt_name.match(name):
                    collected += self.read_txt(os.path.join(path, name))

            return collected
예제 #6
0
    def get_data(self):
        """Return the rows of the transmission CSV at ``self._file_location``.

        Each row gains a ``'Date'`` parsed from its ``'Day'`` (day of year)
        and ``'Year'`` columns; ``'Day'`` is converted to ``int`` and
        ``'Transmission'`` to ``float``.
        """
        rows = CSVFileReader(self._file_location).get_content()
        for row in rows:
            day, year = row['Day'], row['Year']
            row['Date'] = datetime.strptime("%s %s" % (day, year), "%j %Y")
            row['Day'] = int(day)
            row['Transmission'] = float(row['Transmission'])

        return rows
예제 #7
0
    def _cache_load(self):
        """Load the combined data from the per-user cache file, if present.

        The cache lives at ``<home>/.data_reader.cache``.  ``<home>`` is the
        ``HOME`` environment variable when it is set; otherwise it falls
        back to ``os.path.expanduser("~")`` so the lookup no longer fails
        inside ``os.path.join`` on systems that do not define ``HOME``.

        When the file exists its rows are passed through
        ``formats.on_read`` (with no location), run through
        ``self._fill_vals`` and stored in ``self._combined_data``.

        Returns:
            True if the cache file existed and was loaded, False otherwise.
        """
        home = os.environ.get("HOME")
        if home is None:
            home = os.path.expanduser("~")
        cache_fname = os.path.join(home, ".data_reader.cache")

        if not os.path.isfile(cache_fname):
            return False

        self._combined_data = CSVFileReader(cache_fname).get_content()
        for entry in self._combined_data:
            # NOTE(review): the result is not stored back; presumably
            # on_read converts the row in place -- confirm.
            formats.on_read(entry, None)

        self._combined_data = self._fill_vals(self._combined_data)

        print("Loaded cache data from %s" % cache_fname)
        return True
예제 #8
0
    def load_cm(self, fname):
        """Load the ``CM*`` method table from ``fname``.

        ``'probability'`` and ``'success'`` are converted to ``float``.  Rows
        whose ``'method'`` is one of ``"CM1"`` .. ``"CM6"`` also receive a
        ``'function'`` entry bound to the matching ``self._cmN`` method;
        rows with any other method name get no ``'function'`` key.  The
        rows are stored in ``self._cm_functions``.
        """
        known_methods = ("CM1", "CM2", "CM3", "CM4", "CM5", "CM6")
        table = CSVFileReader(fname).get_content()
        for row in table:
            row['probability'] = float(row['probability'])
            row['success'] = float(row['success'])

            method = row['method']
            if method in known_methods:
                # e.g. "CM3" -> self._cm3
                row['function'] = getattr(self, '_cm' + method[2:])

        self._cm_functions = table
예제 #9
0
    def __init__(self, location, t_base):
        """Read the met-data CSV for ``location`` and normalise every row.

        For each reading the ``'Date'`` column (``dd/mm/YYYY``) is parsed
        into ``formats.DATE`` and removed; PAR, T_MAX, T_MIN and RAINFALL
        are run through ``self.parse_float``; ``formats.DD`` is filled from
        ``degree_days`` using ``t_base``; finally any value still equal to
        the string ``"NA"`` is replaced by ``None``.
        """
        self._met_data = CSVFileReader(location.get_met_data()).get_content()
        self._t_base = t_base

        for reading in self._met_data:
            parsed_date = datetime.strptime(reading['Date'], "%d/%m/%Y")
            reading[formats.DATE] = parsed_date
            reading.pop('Date')

            for field in (formats.PAR, formats.T_MAX,
                          formats.T_MIN, formats.RAINFALL):
                reading[field] = self.parse_float(reading[field])

            t_max = reading[formats.T_MAX]
            t_min = reading[formats.T_MIN]
            reading[formats.DD] = degree_days(t_max, t_min, self._t_base)

            # Values are replaced, never added or removed, so iterating a
            # snapshot of the items is equivalent to iterating the keys.
            for key, value in list(reading.items()):
                if value == "NA":
                    reading[key] = None
예제 #10
0
    def calc_RUE(self, LER_dict, k_dict, location, LAI):
        met_data = MetDataReaderCSV(location).get_met_data()
        fll_reader = FLLReader(location)
        genotypes = set([x['Genotype'] for x in LER_dict])

        destructive_phenos = CSVFileReader(
            location.get_destr_phenos()).get_content()
        for entry in destructive_phenos:
            entry['Date'] = datetime.strptime(entry['Date'],
                                              "%Y-%m-%d %H:%M:%S UTC")

            try:
                entry['fresh'] = float(
                    entry['Fresh weight above ground material(g)'])
                entry['fresh_sub'] = float(
                    entry['Fresh weight above ground  sub-sample(g)'])
                entry['dry_sub'] = float(
                    entry['dry weight above ground sub-sample(g)'])
            except ValueError:
                try:
                    entry['dry_weight'] = float(
                        entry['dry weight above ground sub-sample(g)'])
                except ValueError:
                    pass
                continue

            if entry['fresh_sub'] == 0.0:
                entry['dry_weight'] = entry['dry_sub']
                continue
            entry['dry_weight'] = entry['fresh'] * (entry['dry_sub'] /
                                                    entry['fresh_sub'])

        destructive_phenos = [
            x for x in destructive_phenos if 'dry_weight' in x
        ]

        #run the simulation per genotype
        RUE = []
        for genotype in genotypes:
            geno_sub = [
                x for x in destructive_phenos if x['Genotype'] == genotype
            ]
            dates = list(set([x['Date'] for x in geno_sub]))
            dates.sort()

            #create data point groups by dates that are close
            #to each other or the same
            groups = []
            group_id = 0
            for date in dates:
                for group in groups:
                    delta = group['Dates'][0] - date
                    days = math.fabs(delta.days)
                    if days and days < 20:
                        group['Dates'].append(date)
                        break
                else:
                    #create new group
                    group = {'id': group_id, 'Dates': [date]}
                    groups.append(group)
                    group_id += 1

            #get the mean dry weight per group
            mean_DW = []
            #add entry for fll day
            fll_date = fll_reader.get_genotype_fll(genotype)
            mean_DW.append({'Date': fll_date, 'Yield': 0.0})

            for group in groups:
                group_phenos = [
                    x for x in geno_sub if x['Date'] in group['Dates']
                ]
                total_dw = 0.0
                for entry in group_phenos:
                    total_dw += entry['dry_weight']

                total_dw /= float(len(group_phenos))

                #correct the group date to the first one in the group
                mean_DW.append({
                    'Date': sorted(group['Dates'])[0],
                    'Yield': total_dw
                })

            #obtain genotype specific coefficients
            LER = [x for x in LER_dict if x['Genotype'] == genotype]
            LER.sort(key=lambda x: x['stage'])
            k = [x for x in k_dict if x['Genotype'] == genotype]
            if len(k) > 1:
                k = next(x['k'] for x in k if x['Year'] == location.get_year())
            else:
                k = sorted(k, key=lambda x: x['Year'])[0]['k']

            #simulate PAR and record values for days of destructive harvests
            real_LAI = [x for x in LAI if x['Genotype'] == genotype]
            mean_DW = self.simulate_PAR(k, LER, met_data, fll_date, mean_DW,
                                        real_LAI)

            #finally work out what the RUE is from
            #the real DMY and simulated PAR values
            temp_file = tempfile.mkstemp()[1] + genotype.split("-")[0]
            CSVFileWriter(temp_file, mean_DW)
            robjects.r('''
				calc_RUE_r <- function(fname){
					data <- read.csv(fname)
					data$Yield <- data$Yield * 2
					fit <- lm(Yield ~ PAR + 0, data = data)
					return(summary(fit)$coefficients[1])
				}
				''')
            calc_RUE_r = robjects.r("calc_RUE_r")
            RUE_val = calc_RUE_r(temp_file)[0]
            RUE.append({'Genotype': genotype, 'RUE': RUE_val})

        return RUE
예제 #11
0
    def __init__(self, data, scenario, root_dir, load=False):
        """Set up the scatter search and run it straight away.

        data     - {'cd_data': [], 'ml_data': []}
        scenario - 'simple_ml' or 'compound'; any other value raises
                   Exception("STUB")
        root_dir - directory holding score_table.csv and database.csv
        load     - if true, restore the score table and the solution
                   database from root_dir and continue with
                   _run_algorithm2(); otherwise call _run_algorithm()
        """
        # Hard-coded algorithm variables, could be supplied to the
        # constructor if needed.
        self._PSize = 12  # TODO real value: 45

        # weight for previous score entry, when updating the score table
        self._alpha = 0.3

        self._b = 8  # TODO real value: 20

        self._proc_count = 4

        # set class variables
        self._variables = sorted([
            formats.STEM_COUNT, formats.CANOPY_HEIGHT,
            formats.TRANSMISSION, formats.FLOWERING_SCORE,
            formats.LEAF_AREA_INDEX, formats.COL,
            formats.ROW, formats.DD, formats.GENOTYPE,
            formats.RAINFALL, formats.DOY, formats.PAR,
        ])

        self._scenario = scenario
        self._root_dir = root_dir
        if scenario == "simple_ml":
            self._methods = ['rf', 'knn', 'gbm']
        elif scenario == "compound":
            self._methods = ['NaiveMLProcessModelMemfix', 'GAWinModel']
        else:
            raise Exception("STUB")  # TODO

        self._data = self._hack_data(data)

        month_names = set(row[formats.DATE].strftime("%B")
                          for row in self._data['ml_data'])
        self._months = sorted(month_names)

        # find maximum RMSE for methods
        self._max_rmse = self._get_max_rmse()

        # DB to contain all solutions ever explored
        self._database = ScatterPhenoScenarioContainer()
        self._score_table = self._empty_score_table()
        if not load:
            self._run_algorithm()
            return

        saved_scores = os.path.join(self._root_dir, 'score_table.csv')
        self._score_table = CSVFileReader(saved_scores).get_content()
        for row in self._score_table:
            row['score'] = float(row['score'])
            row['value'] = (row['value'] == "True")

        saved_database = os.path.join(self._root_dir, 'database.csv')
        self._database.load_file(saved_database, self._data)
        self._update_score_table()

        self._run_algorithm2()
예제 #12
0
class ScatterPhenoAlgorithm:

    def __init__(self, data, scenario, root_dir, load=False):
        """Set up the scatter search and run it straight away.

        data     - {'cd_data': [], 'ml_data': []}
        scenario - 'simple_ml' or 'compound'; any other value raises
                   Exception("STUB")
        root_dir - directory holding score_table.csv and database.csv
        load     - if true, restore the score table and the solution
                   database from root_dir and continue with
                   _run_algorithm2(); otherwise call _run_algorithm()
        """
        # Hard-coded algorithm variables, could be supplied to the
        # constructor if needed.
        self._PSize = 12  # TODO real value: 45

        # weight for previous score entry, when updating the score table
        self._alpha = 0.3

        self._b = 8  # TODO real value: 20

        self._proc_count = 4

        # set class variables
        self._variables = sorted([
            formats.STEM_COUNT, formats.CANOPY_HEIGHT,
            formats.TRANSMISSION, formats.FLOWERING_SCORE,
            formats.LEAF_AREA_INDEX, formats.COL,
            formats.ROW, formats.DD, formats.GENOTYPE,
            formats.RAINFALL, formats.DOY, formats.PAR,
        ])

        self._scenario = scenario
        self._root_dir = root_dir
        if scenario == "simple_ml":
            self._methods = ['rf', 'knn', 'gbm']
        elif scenario == "compound":
            self._methods = ['NaiveMLProcessModelMemfix', 'GAWinModel']
        else:
            raise Exception("STUB")  # TODO

        self._data = self._hack_data(data)

        month_names = set(row[formats.DATE].strftime("%B")
                          for row in self._data['ml_data'])
        self._months = sorted(month_names)

        # find maximum RMSE for methods
        self._max_rmse = self._get_max_rmse()

        # DB to contain all solutions ever explored
        self._database = ScatterPhenoScenarioContainer()
        self._score_table = self._empty_score_table()
        if not load:
            self._run_algorithm()
            return

        saved_scores = os.path.join(self._root_dir, 'score_table.csv')
        self._score_table = CSVFileReader(saved_scores).get_content()
        for row in self._score_table:
            row['score'] = float(row['score'])
            row['value'] = (row['value'] == "True")

        saved_database = os.path.join(self._root_dir, 'database.csv')
        self._database.load_file(saved_database, self._data)
        self._update_score_table()

        self._run_algorithm2()

    def _run_algorithm2(self):
        """Resume the search from previously saved state.

        Called by the constructor when ``load`` is true.  The scenario
        builder reloads ``cm_functions.csv`` from the root directory, a
        population is generated with G1-G3, and the reference set is built
        from the solution database (not from the new population) before
        the main loop starts.

        The G1 size uses floor division so that it is an integer under
        Python 3 as well as Python 2.
        """
        scenario_builder = SPScenarioBuilder(self._data,
                                             self._variables,
                                             process=True)
        scenario_builder.load_cm(os.path.join(self._root_dir,
                                              'cm_functions.csv'))

        # G1
        logging.info("G1")
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)

        # G2
        logging.info("G2")
        population = self._generator(population, scenario_builder.g2)

        # G3
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)

        # form ref set from database
        ref_set = self._ref_set_update(self._database)

        self._report(ref_set, scenario_builder)
        # Leaving this here in case I change my mind TODO
        # ref_set = ScatterPhenoScenarioContainer()
        # ref_set.load_file(os.path.join(self._root_dir, 'ref_set.csv'),
        #                   self._data)

        self._main_loop(ref_set, scenario_builder, population)

    def _run_algorithm(self):
        """Run the search from scratch.

        Generates the initial population with G1-G3 (logging how long each
        generator takes), builds the reference set from that population,
        writes a report, improves the reference set in parallel and then
        enters the main loop.

        The G1 size uses floor division so that it is an integer under
        Python 3 as well as Python 2.
        """
        scenario_builder = SPScenarioBuilder(self._data,
                                             self._variables,
                                             process=True)

        # G1
        logging.info("G1")
        start = datetime.now()
        population = scenario_builder.g1(self._PSize // 3)
        self._update_score_table(population)
        logging.info("G1 - %s" % (datetime.now() - start))

        # G2
        logging.info("G2")
        start = datetime.now()
        population = self._generator(population, scenario_builder.g2)
        logging.info("G2 - %s" % (datetime.now() - start))

        # G3
        start = datetime.now()
        logging.info("G3")
        population = self._generator(population, scenario_builder.g3)
        logging.info("G3 - %s" % (datetime.now() - start))

        # build ref set
        logging.info("Building ref_set")
        ref_set = self._ref_set_update(population)

        self._report(ref_set, scenario_builder)

        # parallel improvement of the best b/2 solutions
        start = datetime.now()
        logging.info("Improving ref_set")
        ref_set = self._mp_improve(ref_set, scenario_builder)
        logging.info("Improvements - %s" % (datetime.now() - start))

        self._main_loop(ref_set, scenario_builder, population)

    def _main_loop(self, ref_set, scenario_builder, population,
                   stop_file='/home/eey9/.stop_scatter_search'):
        """Iterate combine -> improve -> update until the search settles.

        Each iteration combines the reference set into a pool, improves the
        pool, and builds a new reference set from the union of both.  If
        that leaves the reference set unchanged, the new set is rebuilt
        from the best ``b // 2`` union members plus the most diverse
        members of ``population``.

        The loop ends after 5 consecutive iterations in which the reference
        set did not change, or as soon as ``stop_file`` exists.  A report
        is written after every iteration.

        Args:
            ref_set: current reference set container.
            scenario_builder: builder used for combining and improving.
            population: container used as the source of diverse solutions.
            stop_file: path whose existence requests termination; defaults
                to the location that used to be hard-coded here.
        """
        # Floor division: range() needs an int under Python 3 as well.
        elite_count = self._b // 2

        stop = False
        last_changed = 0
        iteration = 0
        while not stop:
            start_loop = datetime.now()
            # create the pool from combining solutions from ref_set
            logging.info("Performing combinations")
            start = datetime.now()
            pool = self._combine(ref_set, scenario_builder)
            logging.info("Combinations %s" % (datetime.now() - start))

            # improve pool
            logging.info("Improving best combinations")
            start = datetime.now()
            pool = self._mp_improve(pool, scenario_builder)
            logging.info("Improvements %s" % (datetime.now() - start))

            # join ref_set and pool together
            union = deepcopy(ref_set)
            union.add_container(pool)
            union.sort()

            new_ref_set = self._ref_set_update(union)
            if ref_set.same(new_ref_set):
                logging.info("Ref_set not changed")
                new_ref_set = ScatterPhenoScenarioContainer()

                for i in range(elite_count):
                    new_ref_set.add(union.get(i))

                # get the most diverse solutions to what
                # we already have in ref_set
                while new_ref_set.len() < self._b:
                    new_ref_set.add(population.get_diverse(new_ref_set))

            if ref_set.same(new_ref_set):
                last_changed += 1
            else:
                last_changed = 0

            if last_changed >= 5:
                logging.info("Reached optimal solution, terminating...")
                stop = True

            if os.path.exists(stop_file):
                stop = True
                logging.info("Stopping because of file flag...")

            ref_set = new_ref_set
            iteration += 1
            logging.info("Completed iteration %d" % iteration)
            self._report(ref_set, scenario_builder)
            t_delta = datetime.now() - start_loop
            logging.info("Iteration time %s" % t_delta)

        return

    def _ref_set_update(self, source):
        """Build a reference set of ``self._b`` solutions from ``source``.

        ``source`` is sorted in place; its first ``b // 2`` members are
        taken as they are, and the remainder is filled with whatever
        ``source.get_diverse`` returns for the set built so far.

        Floor division keeps the count an integer, which ``range`` requires
        under Python 3.
        """
        source.sort()
        ref_set = ScatterPhenoScenarioContainer()
        for i in range(self._b // 2):
            ref_set.add(source.get(i))

        # get the most diverse solutions to what we already have in ref_set
        while ref_set.len() < self._b:
            ref_set.add(source.get_diverse(ref_set))

        return ref_set

    def _combine(self, container, scenario_builder):
        """Combine every pair of ``container`` members into a new pool.

        Each pair from ``_build_combinations`` is handed to
        ``scenario_builder.combine`` together with the score table.  Pairs
        that raise ``NoValidSolutionException`` are logged and skipped.
        Every new scenario is recorded through ``_update_score_table`` and
        added to the pool unless already there.  If the new scenario has a
        lower utility than some member of ``container``,
        ``scenario_builder.success`` is called with ``b - j`` where ``j``
        is the index of the first such member.

        The running pair number comes from ``enumerate`` rather than
        ``list.index``, which avoids a linear search (and scenario equality
        comparisons) on every log line.

        Returns the pool container.
        """
        # build subsets
        combinations = self._build_combinations(container)
        total = len(combinations)
        pool = ScatterPhenoScenarioContainer()
        for number, combination in enumerate(combinations, 1):
            start = datetime.now()
            try:
                new_scenario = scenario_builder.combine(combination[0],
                                                        combination[1],
                                                        self._score_table)
            except NoValidSolutionException:
                logging.info("Combination %d/%d - %s: no valid solution" %
                             (number, total, (datetime.now() - start)))
                continue

            self._update_score_table(new_scenario)
            if not pool.contains(new_scenario):
                pool.add(new_scenario)

            # see where does the scenario qualify to be in container
            try:
                j = container.index(next(x for x in container.get_all()
                                    if new_scenario.get_utility() <
                                    x.get_utility()))
                scenario_builder.success(self._b - j)
            except StopIteration:
                continue  # Worse than anything in ref_set, does not qualify

            logging.info("Combination %d/%d - %s" %
                         (number, total, (datetime.now() - start)))

        return pool

    def _sp_improve(self, container, scenario_builder):
        """Improve the best ``b // 2`` solutions one after another.

        Single-process counterpart of ``_mp_improve``: the container is
        sorted, ``_improve`` is run on each of its first ``b // 2``
        members, a member is replaced by its best neighbour when that
        neighbour has a lower utility, and every neighbour is fed to
        ``_update_score_table``.

        Floor division keeps the count an integer under Python 3, and the
        best neighbour no longer reuses the name of the list of selected
        solutions.

        Returns the (modified) container.
        """
        container.sort()

        selected = [container.get(i) for i in range(self._b // 2)]
        result = [self._improve(scenario, scenario_builder)
                  for scenario in selected]

        for entry in result:
            index = container.index(entry['individual'])
            best_neighbour = entry['improvements'].get(0)
            if best_neighbour.get_utility() < \
                    entry['individual'].get_utility():
                container.replace(best_neighbour, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _mp_improve(self, container, scenario_builder):
        """Improves b/2 best solutions from the container and updates
        the score table with the generated solutions

        ``_improve`` is mapped over the selected solutions in a pool of
        ``self._proc_count`` processes.  A solution is then replaced by its
        best neighbour when that neighbour has a lower utility, and every
        neighbour is fed to ``_update_score_table``.

        On ``MemoryError`` an e-mail is sent, the error is logged, the
        debugger hook is entered and the error is re-raised.  Before, the
        handler called ``e.message()`` (not callable) and, had it got past
        that, would have hit an unbound ``result``.

        Returns the (modified) container.
        """
        container.sort()
        pool = Pool(processes=self._proc_count)

        logging.info("Starting processes")
        start = datetime.now()
        best = []
        builders = []
        # Floor division: range() needs an int under Python 3 as well.
        for i in range(self._b // 2):
            best.append(container.get(i))
            builders.append(scenario_builder)

        try:
            result = pool.map(self._improve, best, builders)
            pool.close()
            pool.join()
        except MemoryError:
            send_email("I crashed again, please help!")
            logging.exception("MemoryError while improving solutions")
            # NOTE(review): interactive debugger hook kept from the
            # original; consider removing it for unattended runs.
            import pudb
            pudb.set_trace()
            raise

        logging.info("Processes finished - %s" % (datetime.now() - start))
        # How infuriating was that?!
        # pathos was being smart and was caching pool so this is needed
        # to prevent from erroring out
        pool.restart()

        start = datetime.now()
        logging.info("mp_improve second loop")
        for entry in result:
            index = container.index(entry['individual'])
            best_neighbour = entry['improvements'].get(0)
            if best_neighbour.get_utility() < \
                    entry['individual'].get_utility():
                container.replace(best_neighbour, index)

            for improvement in entry['improvements'].get_all():
                self._update_score_table(improvement)

        logging.info("mp_improve second loop - %s" % (datetime.now() - start))
        logging.info("Improved %d solutions" % container.get_changes())
        container.reset_changes()
        return container

    def _improve(self, individual, scenario_builder):
        """Explore the neighbourhood of ``individual``.

        Two kinds of neighbours are produced with ``scenario_builder``:
        single flips (one per candidate from ``_build_candidate_list``) and
        swaps of every pair of solution positions.  Neighbours identical to
        ``individual`` or failing ``valid(process=True)`` are skipped; the
        rest are evaluated and collected without duplicates.  Swap
        neighbours are additionally stored in ``self._database``.

        Returns a dict with the original under ``'individual'`` and the
        sorted container of neighbours under ``'improvements'``.
        """
        started = datetime.now()
        r_base = importr("base")
        candidates = self._build_candidate_list(individual)

        neighbours = ScatterPhenoScenarioContainer()

        for candidate in candidates:
            flipped = scenario_builder.flip(individual, candidate)
            if not flipped.same(individual) and flipped.valid(process=True):
                flipped = self._evaluate(flipped, r_base)
                if not neighbours.contains(flipped):
                    neighbours.add(flipped)

        for i in range(len(individual.get_solution())):
            for j in range(i + 1, len(individual.get_solution())):
                swapped = scenario_builder.swap(individual, i, j)
                if swapped.same(individual):
                    continue
                if not swapped.valid(process=True):
                    continue

                swapped = self._evaluate(swapped, r_base)
                if not neighbours.contains(swapped):
                    neighbours.add(swapped)

                if not self._database.contains(swapped):
                    self._database.add(swapped)

        neighbours.sort()

        elapsed = datetime.now() - started
        logging.info("self._improve finished - %s" % elapsed)
        return {'individual': individual,
                'improvements': neighbours}

    def _build_candidate_list(self, individual):
        candidate_list = []
        for entry in individual.get_solution():
            t = next(x for x in entry.keys() if x != 'value')
            candidate_list.append(next(x for x in self._score_table
                                       if x['type'] == t and
                                       entry[t] == x['name'] and
                                       entry['value'] != x['value']))

        # smallest score = highest probability so DONT CHANGE THIS
        candidate_list.sort(key=lambda x: x['score'])
        return candidate_list

    def _generator(self, population, func):
        generated = 0
        while generated < self._PSize/3:
            worked = False
            while not worked:
                try:
                    individual = func(self._score_table)
                    population.add(individual)
                    self._update_score_table(individual)
                    generated += 1
                    worked = True
                except ScatterPhenoScenarioContainerException:
                    # already exists
                    pass

        return population

    def _evaluate(self, pheno_scenario, base=None):
        if base is None:
            base = glob_base

        if self._database.contains(pheno_scenario):
            return self._database.request(pheno_scenario)

        model = ScatterPhenoModel(self._data,
                                  base,
                                  pheno_scenario,
                                  self._methods,
                                  self._max_rmse)

        util = (0.3 * pheno_scenario.get_cost()) + model.get_rmse()
        pheno_scenario.set_utility(util)
        pheno_scenario.set_rmse(model.get_absolute_rmse())
        return pheno_scenario

    def _update_score_table(self, input_=None):
        if input_ is None:
            self._score_table = self._calc_table(self._database.get_all())

        elif input_.__class__ == ScatterPhenoScenarioContainer:
            # we are given a population - this should happen after G1
            population = input_.get_all()

            for individual in population:
                if not self._database.contains(individual):
                    self._evaluate(individual)
                    self._database.add(individual)
                else:
                    logging.info("Warning, individual in database, weird!")

            # calculate score based on whole database population
            self._score_table = self._calc_table(self._database.get_all())

        elif input_.__class__ == ScatterPhenoScenario:
            individual = input_
            if self._database.contains(individual):
                # individual already in database, update the individual
                # with the rmse and utility
                index = self._database.index(individual)

                # but only if it needs updating
                if not individual.has_utility():
                    self._database.get(index).copy_to(individual)

                return

            if not individual.is_evaluated():
                self._evaluate(individual)

            # create the table at t
            new_table = self._calc_table(self._database.get_all() +
                                         [individual])

            # use the tables at t and t-1 to calculate the smoothed score
            for entry, new_entry in zip(self._score_table, new_table):
                entry['score'] = (self._alpha * entry['score'] +
                                  (1 - self._alpha) * new_entry['score'])

            self._database.add(individual)

    def _calc_table(self, population):
        new_table = []
        # using self._score_table just for template here,
        # none of the values will be copied
        for entry in self._score_table:
            new_entry = deepcopy(entry)
            index = self._score_table.index(entry)/2

            match = [x for x in population
                     if x.get(index) == entry['value']]
            non_match = [x for x in population
                         if x.get(index) != entry['value']]

            # little hack to get around certain variables not having
            # both values
            if len(match) == 0 or len(non_match) == 0:
                score = 0.5
            else:
                util_match = (sum([x.get_utility() for x in match]) /
                              len(match))
                util_non_match = (sum([x.get_utility()
                                       for x in non_match]) /
                                  len(non_match))

                score = util_match / (util_match + util_non_match)

            new_entry['score'] = score
            new_table.append(new_entry)

        return new_table

    def _get_max_rmse(self):
        rmses = dict()
        for method in self._methods:
            model = MLProcessModel(method, self._data, self._data,
                                   self._variables, glob_base)

            rmses[method] = model._rmse_abs

        return rmses

    def _build_combinations(self, container):
        container.sort()
        population = container.get_all()

        combinations = []
        for i in range(len(population)):
            for j in range(i + 1, len(population)):
                combination = [population[i], population[j]]
                combinations.append(combination)

        return combinations

    def _hack_data(self, data):
        # remove winter months that are useless
        for key in data.keys():
            new_data = [x for x in data[key]
                        if x[formats.DATE].month > 3 and
                        x[formats.DATE].month < 11]
            data[key] = new_data

        # remove records with missing values for needed variables
        needed_vars = deepcopy(self._variables)

        ml_data = []
        for entry in data['ml_data']:
            add = True
            for key in needed_vars:
                if entry[key] is None:
                    add = False
                    break

            if add:
                ml_data.append(entry)

        data['ml_data'] = ml_data
        return data

    def _empty_score_table(self):
        score_table = []
        for month in self._months:
            for val in [False, True]:
                entry = dict()
                entry['name'] = month
                entry['type'] = 'month'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)

        for var in self._variables:
            for val in [False, True]:
                entry = dict()
                entry['name'] = var
                entry['type'] = 'variable'
                entry['value'] = val
                entry['score'] = 0
                score_table.append(entry)

        return score_table

    def _report(self, ref_set, sb):
        """Write the current search state to CSV files in the root dir.

        Files written: ``ref_set.csv``, ``score_table.csv``,
        ``database.csv``, ``cm_functions.csv`` (the rows of
        ``sb._cm_functions`` without their ``'function'`` entry) and
        ``memfile.csv``, which gets one memory-usage row per call (created
        on the first call, appended to afterwards).

        The ``cm_functions`` rows are copied without the ``'function'``
        key instead of being deep-copied first: ``'function'`` holds a
        bound method, so ``deepcopy`` also duplicated the object that
        method belongs to on every report.

        Args:
            ref_set: container written to ``ref_set.csv``.
            sb: scenario builder providing ``_cm_functions``.
        """
        self._to_csv(os.path.join(self._root_dir, 'ref_set.csv'), ref_set)
        CSVFileWriter(os.path.join(self._root_dir, 'score_table.csv'),
                      self._score_table)
        self._to_csv(os.path.join(self._root_dir, 'database.csv'),
                     self._database)

        # save the info to file without the function reference
        func = [
            dict((key, value) for key, value in row.items()
                 if key != 'function')
            for row in sb._cm_functions
        ]

        CSVFileWriter(os.path.join(self._root_dir, 'cm_functions.csv'), func)

        # report memory usage
        mem_report = [{'date': datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
                       'mem': memory()/float(1024**2),
                       'res_memory': res_memory(),
                       'total_memory': total_memory()}]

        memfile = os.path.join(self._root_dir, 'memfile.csv')
        if os.path.exists(memfile):
            CSVFileWriter(memfile, mem_report, write_mode='a')
        else:
            CSVFileWriter(memfile, mem_report)

    def _to_csv(self, fname, container):
        """Write ``to_dict()`` of every member of ``container`` to ``fname``."""
        rows = []
        for scenario in container.get_all():
            rows.append(scenario.to_dict())

        CSVFileWriter(fname, rows)
예제 #13
0
 def parse_file(self, filename):
     """Read the CSV at ``filename`` and return it run through ``parse_content``."""
     raw_rows = CSVFileReader(filename).get_content()
     return self.parse_content(raw_rows)