예제 #1
0
class SampledTrainer(Trainer):
    """Trainer that grows its sample set from misclassified training data.

    Every cycle predicts on the whole training set, adds up to
    ``--num-sample-chunk`` misclassified samples that are not yet in
    ``self.samples``, saves the sample set, records metrics and then fits
    the model on ``self.sampled_data`` (defined outside this class).
    """

    def __init__(self):
        """Parse arguments and load any previously selected samples.

        Exits through ``parser.error`` when ``--invert-samples`` is given.
        """
        parser = create_parser(usage)
        super().__init__(parser)
        if self.args.invert_samples:
            parser.error('--invert-samples should be left blank')
        # Fall back to the default file name; either way the
        # '{model_base}' placeholder is expanded.
        self.args.samples_file = (self.args.samples_file
                                  or '{model_base}.samples.json').format(
                                      model_base=self.model_base)
        self.samples, self.hash_to_ind = self.load_sample_data(
            self.args.samples_file, self.train)
        self.metrics_fiti = Fitipy(self.model_base + '.logs',
                                   'sampling-metrics.txt')

    def write_sampling_metrics(self, predicted):
        """Print the training accuracy and append it to the metrics file.

        Each appended line is ``<sampled fraction>\\t<accuracy>``, where the
        sampled fraction is ``len(self.samples) / len(self.train[1])``.

        Args:
            predicted: Model outputs for ``self.train[0]``; values above
                0.5 count as a positive prediction.
        """
        correct = float(
            sum((predicted > 0.5) == (self.train[1] > 0.5)) /
            len(self.train[1]))
        print('Successfully calculated: {0:.3%}'.format(correct))

        lines = self.metrics_fiti.read().lines()
        lines.append('{}\t{}'.format(
            len(self.samples) / len(self.train[1]), correct))
        self.metrics_fiti.write().lines(lines)

    def choose_new_samples(self, predicted):
        """Pick misclassified samples that are not in the sample set yet.

        Args:
            predicted: Model outputs aligned with ``self.train[0]``.

        Returns:
            An iterator over at most ``self.args.num_sample_chunk`` sample
            hashes.
        """
        failed_samples = {
            calc_sample_hash(inp, target)
            for inp, pred, target in zip(
                self.train[0], predicted, self.train[1])
            if (pred > 0.5) != (target > 0.5)
        }
        remaining_failed_samples = failed_samples - self.samples
        print('Remaining failed samples:', len(remaining_failed_samples))
        return islice(remaining_failed_samples, self.args.num_sample_chunk)

    def run(self):
        """Run ``self.args.cycles`` rounds of sampling followed by fitting."""
        print('Writing to:', self.args.samples_file)
        print('Writing metrics to:', self.metrics_fiti.path)
        for _ in range(self.args.cycles):
            print('Calculating on whole dataset...')
            predicted = self.model.predict(self.train[0])

            # Materialize the selection so the number actually added can be
            # reported; it may be smaller than the requested chunk size.
            new_samples = list(self.choose_new_samples(predicted))
            self.samples.update(new_samples)
            Fitipy(self.args.samples_file).write().set(self.samples)
            print('Added', len(new_samples), 'samples')

            self.write_sampling_metrics(predicted)

            self.model.fit(*self.sampled_data,
                           batch_size=self.args.batch_size,
                           epochs=self.epoch + self.args.epochs,
                           callbacks=self.callbacks,
                           initial_epoch=self.epoch,
                           validation_data=self.test)
예제 #2
0
class FilesystemService(ServicePlugin):
    """Service offering file operations relative to a root directory.

    The root defaults to the expanded ``rt.paths.user_config`` and is
    created during construction when it does not exist yet.
    """

    def __init__(self, rt, root=None):
        ServicePlugin.__init__(self, rt)
        if not root:
            root = expanduser(rt.paths.user_config)
        self.root = root
        self.fiti = Fitipy(root)

        # An empty relative path addresses the root directory itself.
        root_exists = self.isdir('')
        if not root_exists:
            self.mkdir('')

    def path(self, *path):
        """Return the given path components joined below the root."""
        return join(self.root, *path)

    def read(self, *path) -> FitiReader:
        """Return the Fitipy reader for the given relative path."""
        reader = self.fiti.read(*path)
        return reader

    def write(self, *path) -> FitiWriter:
        """Return the Fitipy writer for the given relative path."""
        writer = self.fiti.write(*path)
        return writer

    def subdir(self, *path):
        """Return a new service rooted at the given sub-directory."""
        sub_root = self.path(*path)
        return FilesystemService(self.rt, sub_root)

    def open(self, *path, mode='r'):
        """Open the file at the given relative path with ``mode``."""
        target = self.path(*path)
        return open(target, mode)

    def isfile(self, *path):
        """Tell whether the relative path is an existing file."""
        target = self.path(*path)
        return isfile(target)

    def isdir(self, *path):
        """Tell whether the relative path is an existing directory."""
        target = self.path(*path)
        return isdir(target)

    def mkdir(self, *path):
        """Create the directory and missing parents; existing is fine."""
        target = self.path(*path)
        makedirs(target, exist_ok=True)
class SavedJson:
    """Dict-like wrapper that saves to disk on modifications.

    The data is loaded from ``filename`` through Fitipy on construction
    and written back whenever a mutating operation is performed.
    """

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.fiti = Fitipy(self.filename)
        self.data = self.fiti.read().dict()

    def __getitem__(self, item):
        """Return the value stored under ``item``; raises ``KeyError``."""
        return self.data[item]

    def __setitem__(self, key, value):
        """Store ``value`` and write to disk only if something changed."""
        changed = key not in self.data or self.data[key] != value
        self.data[key] = value
        if changed:
            self.fiti.write().dict(self.data)

    def __delitem__(self, key):
        """Remove ``key`` (raising ``KeyError`` if absent) and save."""
        del self.data[key]
        self.fiti.write().dict(self.data)

    def get(self, k, default=None):
        """Return the value for ``k``, or ``default`` when it is missing."""
        return self.data.get(k, default)

    def update(self, data=None, **kwargs):
        """Merge ``data`` and/or keyword items into the dict and save.

        Args:
            data: Optional mapping or iterable of key/value pairs.
            **kwargs: Additional items to set.
        """
        # dict.update(None) raises TypeError, so only forward ``data`` when
        # the caller actually supplied it.
        if data is None:
            self.data.update(**kwargs)
        else:
            self.data.update(data, **kwargs)
        self.fiti.write().dict(self.data)
예제 #4
0
class TrainSampledScript(TrainScript):
    """Training script that grows its sample set from misclassified data.

    Every cycle predicts on the whole training set, adds up to
    ``--num-sample-chunk`` misclassified samples that are not yet in
    ``self.samples``, saves the sample set, records metrics and then fits
    the model on ``self.sampled_data`` (defined outside this class).
    """

    usage = Usage('''
        Train a model, sampling data points with the highest loss from a larger dataset

        :-c --cycles int 200
            Number of sampling cycles of size {epoch} to run

        :-n --num-sample-chunk int 50
            Number of new samples to introduce at a time between training cycles

        :-sf --samples-file str -
            Json file to write selected samples to.
            Default = {model_base}.samples.json

        :-is --invert-samples
            Unused parameter
        ...
    ''') | TrainScript.usage

    def __init__(self, args):
        """Validate arguments and load any previously selected samples.

        Raises:
            ValueError: If ``--invert-samples`` was given.
        """
        super().__init__(args)
        if self.args.invert_samples:
            raise ValueError('--invert-samples should be left blank')
        # Fall back to the default file name; either way the
        # '{model_base}' placeholder is expanded.
        self.args.samples_file = (self.args.samples_file
                                  or '{model_base}.samples.json').format(
                                      model_base=self.model_base)
        self.samples, self.hash_to_ind = self.load_sample_data(
            self.args.samples_file, self.train)
        self.metrics_fiti = Fitipy(self.model_base + '.logs',
                                   'sampling-metrics.txt')

    def write_sampling_metrics(self, predicted):
        """Print the training accuracy and append it to the metrics file.

        Each appended line is ``<sampled fraction>\\t<accuracy>``, where the
        sampled fraction is ``len(self.samples) / len(self.train[1])``.

        Args:
            predicted: Model outputs for ``self.train[0]``; values above
                0.5 count as a positive prediction.
        """
        correct = float(
            sum((predicted > 0.5) == (self.train[1] > 0.5)) /
            len(self.train[1]))
        print('Successfully calculated: {0:.3%}'.format(correct))

        lines = self.metrics_fiti.read().lines()
        lines.append('{}\t{}'.format(
            len(self.samples) / len(self.train[1]), correct))
        self.metrics_fiti.write().lines(lines)

    def choose_new_samples(self, predicted):
        """Pick misclassified samples that are not in the sample set yet.

        Args:
            predicted: Model outputs aligned with ``self.train[0]``.

        Returns:
            An iterator over at most ``self.args.num_sample_chunk`` sample
            hashes.
        """
        failed_samples = {
            calc_sample_hash(inp, target)
            for inp, pred, target in zip(
                self.train[0], predicted, self.train[1])
            if (pred > 0.5) != (target > 0.5)
        }
        remaining_failed_samples = failed_samples - self.samples
        print('Remaining failed samples:', len(remaining_failed_samples))
        return islice(remaining_failed_samples, self.args.num_sample_chunk)

    def run(self):
        """Run ``self.args.cycles`` rounds of sampling followed by fitting."""
        print('Writing to:', self.args.samples_file)
        print('Writing metrics to:', self.metrics_fiti.path)
        for _ in range(self.args.cycles):
            print('Calculating on whole dataset...')
            predicted = self.model.predict(self.train[0])

            # Materialize the selection so the number actually added can be
            # reported; it may be smaller than the requested chunk size.
            new_samples = list(self.choose_new_samples(predicted))
            self.samples.update(new_samples)
            Fitipy(self.args.samples_file).write().set(self.samples)
            print('Added', len(new_samples), 'samples')

            self.write_sampling_metrics(predicted)

            self.model.fit(*self.sampled_data,
                           batch_size=self.args.batch_size,
                           epochs=self.epoch + self.args.epochs,
                           callbacks=self.callbacks,
                           initial_epoch=self.epoch,
                           validation_data=self.test)
예제 #5
0
def main():
    """Poll a POP3 mailbox forever and store each new email as an Event.

    The topic list is cached in ``<cache_file>.topics.json`` and the number
    of messages already handled in ``<cache_file>.num``. An email whose
    text is more than 90% similar to the ``emailSrc`` of an existing event
    is ignored.
    """
    args = create_parser(usage).parse_args()
    num_seen_file = Fitipy(args.cache_file + '.num')
    topics_cache = args.cache_file + '.topics.json'
    if not isfile(topics_cache):
        print('Generating topics...')
        # Generate before opening the file so that a failure does not leave
        # an empty, unparsable cache behind.
        keywords = get_keywords_uiuc()
        with open(topics_cache, 'w') as f:
            json.dump(keywords, f)
    with open(topics_cache) as f:
        topics = json.load(f)

    num_seen = num_seen_file.read().read(0, int)
    with open(args.auth_file) as f:
        # safe_load: plain credentials need no arbitrary object
        # construction, and yaml.load() without a Loader is rejected by
        # PyYAML >= 6.
        auth = yaml.safe_load(f)
    email = auth['username']
    password = auth['password']
    # Default host is 'pop3.' followed by the domain of the address.
    server = auth.get('pop3_host', 'pop3.' + email.split('@')[-1])
    client = StatelessClass(EmailReceiver,
                            email=email,
                            password=password,
                            server=server)  # type: EmailReceiver

    # The trailing '+' makes the pattern consume the whole host part of a
    # URL instead of only the first character after '://'.
    url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')

    print('Waiting for emails...')
    while True:
        num_messages = len(client.get_list())
        if num_messages < num_seen:
            # The mailbox holds fewer messages than already handled, so
            # restart counting from the current size.
            num_seen = num_messages
            num_seen_file.write().write(num_seen)
        if num_messages <= num_seen:
            time.sleep(1)
            continue
        # Message ids are 1-based.
        for msg_id in range(num_seen + 1, num_messages + 1):
            message = client.get_email(msg_id)
            print('Found new email from {} titled {}.'.format(
                message['From'], message['Subject']))
            email_txt = '\n'.join(message['text'])
            email_txt = BeautifulSoup(email_txt).text
            email_txt = url_pattern.sub('', email_txt)

            freq = calc_freq(email_txt, topics)
            tags = relevant_topics(freq)
            print('Found the following tags:', ', '.join(tags))
            events = Event.find()
            matched_events = [
                event for event in events if event.get('emailSrc') and
                SequenceMatcher(a=event['emailSrc'], b=email_txt).ratio() > 0.9
            ]
            if matched_events:
                print('Ignoring, similar to {} other emails'.format(
                    len(matched_events)))
            else:
                Event.add({
                    'name': message['Subject'],
                    'description': email_txt,
                    'location': '',
                    'time': int(time.time()),
                    'tags': tags,
                    'emailSrc': email_txt
                })
            # Persist progress after every message so that a restart does
            # not process it again.
            num_seen += 1
            num_seen_file.write().write(num_seen, str)