def elicit_activation_states(lstm, xys, activations_dir):
    activation_states = {}

    for key in lstm.keys():
        start_queue(activation_states, activations_dir, key)

    total = 0
    instances = 0

    for j, xy in enumerate(xys):
        total += 1
        instances += len(xy.x)
        stepwise_rnn = lstm.stepwise(handle_unknown=True)
        sequence = tuple(xy.x) + (xy.y[-1], )

        for i, word_pos in enumerate(xy.x):
            result, instruments = stepwise_rnn.step(word_pos[0],
                                                    rnn.LSTM_INSTRUMENTS)

            for part, layer in lstm.part_layers():
                activation_states[lstm.encode_key(part, layer)].put(
                    states.ActivationState(
                        sequence, i,
                        tuple([float(v) for v in instruments[part][layer]])))

    # Mark each queue as finished by enqueuing a None sentinel.
    for value in activation_states.values():
        value.put(None)

    user_log.info(
        "%d sentences, eliciting %d activation states (per part-layer)." %
        (total, instances))
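
The put(None) calls above are the conventional sentinel for terminating a queue consumer. A minimal sketch of that protocol, assuming (hypothetically) that start_queue pairs each key with a queue.Queue drained by a writer thread; the real helper is not shown in this snippet:

import queue
import threading

def start_queue_sketch(queues, key):
    # Hypothetical stand-in for start_queue: one queue per key, drained by a
    # daemon thread that consumes items until it sees the None sentinel.
    q = queue.Queue()
    queues[key] = q

    def drain():
        while True:
            item = q.get()
            if item is None:  # Sentinel: the producer is finished.
                break
            print("persist", key, item)  # Stand-in for writing to disk.

    threading.Thread(target=drain, daemon=True).start()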
Example #2
def main(argv):
    ap = ArgumentParser(prog="measure-sequence-changes")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "validation", "test"])
    ap.add_argument("sequential_dir")
    ap.add_argument("keys", nargs="+")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir)
    minimum, maximum, sequence_changes = measure(lstm, aargs.data_dir,
                                                 aargs.kind, aargs.keys)

    for key in aargs.keys:
        distance, index, sequence = minimum[key]
        sequence_str, changes_str = stringify(sequence,
                                              sequence_changes[sequence][key])
        user_log.info("Global minimum for %s of %.4f @%d:\n  %s\n  %s" %
                      (key, distance, index, sequence_str, changes_str))
        distance, index, sequence = maximum[key]
        sequence_str, changes_str = stringify(sequence,
                                              sequence_changes[sequence][key])
        user_log.info("Global maximum for %s of %.4f @%d:\n  %s\n  %s" %
                      (key, distance, index, sequence_str, changes_str))

    return 0
Example #3
def analyze(stream, kind):
    count = 0
    length = 0

    for item in stream:
        count += 1
        length += len(item.x)

    user_log.info("%s average length: %.4f" % (kind, length / float(count)))
Example #4
def run(port, fe_converter):
    class ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
        pass

    #patch_Thread_for_profiling()
    server_address = ('', port)
    httpd = ThreadingHTTPServer(server_address, ServerHandler)
    httpd.daemon_threads = True
    httpd.fe_converter = fe_converter
    httpd.sessions = {}
    user_log.info('Starting httpd %d...' % port)
    httpd.serve_forever()
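
The ThreadingMixIn composition above is the standard-library recipe for handling each request on its own thread. A self-contained sketch with a trivial handler (EchoHandler and the port are illustrative, not the ServerHandler used above):

from http.server import BaseHTTPRequestHandler, HTTPServer
from socketserver import ThreadingMixIn

class EchoHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Echo the request path back as plain text.
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(self.path.encode("utf-8"))

class ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
    daemon_threads = True  # Request threads won't block interpreter exit.

if __name__ == "__main__":
    ThreadingHTTPServer(("", 8080), EchoHandler).serve_forever()

Python 3.7+ also ships an equivalent http.server.ThreadingHTTPServer.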
Example #5
def test_model(lstm, model, states_dir, is_baseline, key_set):
    as_input = as_input_fn(lstm, model)

    def stream_fn(key):
        for hidden_state in states.stream_hidden_test(states_dir, key):
            yield mlbase.Xy(as_input(key, hidden_state),
                            hidden_state.annotation)

    #user_log.info("Train data.")
    #_, _ = score_parts(model, stream_fn, False, is_baseline)
    user_log.info("Test data.")
    key_perplexity, total_perplexity = score_parts(lstm, model, stream_fn,
                                                   True, is_baseline, key_set)
    return key_perplexity, total_perplexity
Example #6
def dry_run(xys, sample_rate, kind):
    total = 0
    sampled = 0
    instances = 0

    for j, xy in enumerate(xys):
        total += 1

        if sample_rate == 1.0 or random.random() <= sample_rate:
            sampled += 1
            instances += len(xy.x)

    user_log.info(
        "(dry run) %s %.4f: %d sentences sampled down to %d, eliciting %d hidden states (per part-layer)."
        % (kind, sample_rate, total, sampled, instances))
Example #7
def main(argv):
    ap = ArgumentParser(prog="generate-reduction-buckets")
    ap.add_argument("-v", "--verbose", default=False, action="store_true", help="Turn on verbose logging.")
    ap.add_argument("--grouping", nargs="*", default=None)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("buckets_dir")
    ap.add_argument("target", type=int)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0], aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    part_learned_mse = {}
    part_fixed_mse = {}

    if aargs.grouping is None:
        for key in lstm.keys():
            learned_mse, fixed_mse = generate_buckets(aargs.states_dir, key, lstm.part_width(key), aargs.buckets_dir, aargs.target)
            part_learned_mse[key] = learned_mse
            part_fixed_mse[key] = fixed_mse
    else:
        learned_mse, fixed_mse = generate_buckets_grouping(lstm, aargs.states_dir, aargs.grouping, aargs.buckets_dir, aargs.target)
        part_learned_mse = learned_mse
        part_fixed_mse = fixed_mse

    with open(os.path.join(aargs.buckets_dir, "analysis.csv"), "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "mse"])
        total_learned = 0.0
        total_fixed = 0.0
        count_learned = 0
        count_fixed = 0

        for key, error in sorted(part_learned_mse.items()):
            total_learned += error
            count_learned += 1
            writer.writerow(["learned", key, "%f" % error])

        for key, error in sorted(part_fixed_mse.items()):
            total_fixed += error
            count_fixed += 1
            writer.writerow(["fixed", key, "%f" % error])

        user_log.info("Total scores (learned, fixed): %s, %s" % (total_learned / count_learned, total_fixed / count_fixed))

    return 0
Example #8
def main(argv):
    ap = ArgumentParser(prog="query-data")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument(
        "--limit",
        type=int,
        default=10,
        help="Truncate the results at a maximum of LIMIT. "
        "A negative value finds all matches (unlimited).")
    ap.add_argument("--match",
                    choices=["include", "sequence", "relative"],
                    default="include")
    ap.add_argument("data_dir")
    ap.add_argument("kind", choices=["train", "test"])
    ap.add_argument("words", nargs="*", default=None)
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    if aargs.match == "relative":
        # Quickest way to implement relative is just to make it correct for N = 2.
        assert len(aargs.words) == 2

    truncated = False
    count = 0

    for xy in data.stream_data(aargs.data_dir, aargs.kind):
        # TODO: work for non-lm cases.
        sequence = [item[0] for item in xy.x] + [xy.y[-1][0]]

        if matches(sequence, aargs.words, aargs.match):
            count += 1
            logging.debug("Instance: %s" % " ".join(sequence))

        if aargs.limit > 0 and count >= aargs.limit:
            logging.debug("Truncating..")
            truncated = True
            break

    user_log.info("Found %d%s instances." %
                  (count, " (truncated)" if truncated else ""))
    return 0
Example #9
def run_continuously(config, dry_run):
    #thestorm = dt.datetime(2020, 1, 12, 8)
    #thestorm.isoformat()
    darksky = DarkSky(config.darksky_key)
    already_snowing = False

    while True:
        now = dt.datetime.now(TIME_ZONE)
        logging.debug(now)
        forecast = get_forecast(darksky, config)
        snow_event = next_snowfall(forecast)

        if snow_event is None:
            # We don't have a clue when the next snowfall may be
            next_poll = LONG_POLL
            already_snowing = False
        else:
            if snow_event.time >= now:
                # The next snowfall is at some point in the future (or right this moment).
                duration_estimate = snow_event.time - now
            else:
                # This is a historical query - pretend now is the predicted time point.
                duration_estimate = EMPTY_DURATION

            if duration_estimate < SNOW_THRESHOLD:
                user_log.info("It's snowing in %s!" % config.location)
                send_message(snow_event.accumulation, already_snowing, config,
                             dry_run)
                next_poll = SNOW_POLL
                already_snowing = True
            else:
                if already_snowing:
                    user_log.info("Stopped")

                already_snowing = False
                # total_seconds() covers multi-day estimates; .seconds alone
                # would silently drop the days component.
                estimate = dt.timedelta(
                    seconds=int(duration_estimate.total_seconds() * 0.2))

                if estimate > LONG_POLL:
                    # If the next predicted snowfall is too far in the future, cap it off at the long poll duration.
                    next_poll = LONG_POLL
                else:
                    next_poll = max(SNOW_THRESHOLD, estimate)

        logging.debug("Sleeping for %s seconds." % next_poll.seconds)
        time.sleep(next_poll.seconds)
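
Note that timedelta.seconds is only the seconds component of a delta (days are stored separately), which is why the sleep above uses total_seconds(). A quick illustration:

import datetime as dt

d = dt.timedelta(days=2, seconds=30)
print(d.seconds)          # 30 (just the seconds field)
print(d.total_seconds())  # 172830.0 (the full duration)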
Example #10
def run_server(port,
               api_root,
               resource_path,
               handler_map,
               ip_whitelist=["127.0.0.1"]):
    class ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
        pass

    server_address = ('', port)
    httpd = ThreadingHTTPServer(server_address, ServerHandler)
    httpd.daemon_threads = True
    httpd.api_root = api_root if api_root.endswith("/") else "%s/" % api_root
    httpd.resource_path = resource_path if resource_path.endswith(
        "/") else "%s/" % resource_path
    httpd.handlers = handler_map
    httpd.ip_whitelist = ip_whitelist
    user_log.info('Starting httpd %d...' % port)
    httpd.serve_forever()
Example #11
def elicit_hidden_states(lstm, xys, annotation_fn, sample_rate, states_dir,
                         kind):
    hidden_states = {}

    for key in lstm.keys():
        start_queue(hidden_states, states_dir, kind, key)

    total = 0
    sampled = 0
    instances = 0

    for j, xy in enumerate(xys):
        total += 1

        if sample_rate == 1.0 or random.random() <= sample_rate:
            sampled += 1
            instances += len(xy.x)
            stepwise_rnn = lstm.stepwise(handle_unknown=True)

            for i, word_pos in enumerate(xy.x):
                # Set the annotation to that which the rnn has been trained against, not the actual learned annotation (which will be fixed).
                # For example, consider the two training examples: "the little prince" -> "was" and "the little prince" -> "is".
                # We need predictor samples for both "was" and "is", but if we use the actual rnn annotation this will fixate on just one of these.
                annotation = annotation_fn(xy.y, i)
                result, instruments = stepwise_rnn.step(
                    word_pos[0], rnn.LSTM_INSTRUMENTS)

                for part, layer in lstm.part_layers():
                    hidden_states[lstm.encode_key(part, layer)].put(
                        states.HiddenState(
                            word_pos[0],
                            tuple([float(v)
                                   for v in instruments[part][layer]]),
                            annotation))

    # Mark each queue as finished by enqueuing a None sentinel.
    for value in hidden_states.values():
        value.put(None)

    user_log.info(
        "%s %.4f: %d sentences sampled down to %d, eliciting %d hidden states (per part-layer)."
        % (kind, sample_rate, total, sampled, instances))
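
As the comment in the inner loop explains, annotation_fn should recover the training target at each timestep rather than the rnn's own (deterministic) prediction. A hypothetical annotation_fn consistent with that comment, assuming xy.y is aligned with xy.x as (word, pos) pairs:

def next_word_annotation(y, i):
    # Hypothetical: label timestep i with the training target at that
    # position, so both "was" and "is" style continuations get sampled.
    return y[i][0]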
Example #12
def run_server(port, words, neural_network, query_engine, pattern_engine):
    class ThreadingHTTPServer(ThreadingMixIn, HTTPServer):
        pass

    server_address = ('', port)
    httpd = ThreadingHTTPServer(server_address, ServerHandler)
    httpd.daemon_threads = True
    httpd.handlers = {
        "echo": handlers.Echo(),
        "weight-explain": handlers.WeightExplain(neural_network),
        "weights": handlers.Weights(neural_network),
        "weight-detail": handlers.WeightDetail(neural_network),
        "words": handlers.Words(words.labels()),
        "sequence-matches": handlers.SequenceMatches(query_engine),
        "sequence-matches-estimate":
        handlers.SequenceMatchesEstimate(query_engine),
        "soft-filters": handlers.SoftFilters(neural_network),
        "pattern-matches": handlers.PatternMatches(pattern_engine),
    }
    user_log.info('Starting httpd %d...' % port)
    httpd.serve_forever()
Example #13
def main(argv):
    ap = ArgumentParser(prog="language-model")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("--corpus", default="corpus.txt")
    ap.add_argument("--epochs", default=100, type=int)
    args = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  args.verbose, False, True)
    words, xy_sequences, neural_network = domain.create(
        args.corpus, args.epochs, args.verbose)

    #while neural_network.is_setting_up():
    #    pass

    # Block until the background training thread completes.
    neural_network._background_training.join()
    accuracy = neural_network.lstm.test(
        [[rnn.Xy(t[0], t[1]) for t in sequence] for sequence in xy_sequences],
        True)
    user_log.info("accuracy: %s" % accuracy)
def score_parts(lstm, model, stream_fn, debug, is_baseline, key_set):
    key_perplexity = {}
    total_perplexity = 0.0
    count = 0

    for key in lstm.keys():
        if key_set is None or key in key_set:
            if is_baseline and count == 1:
                # We don't need to run across all the keys for the baseline - they would all be the same.
                break

            count += 1
            perplexity = model.test(lambda: stream_fn(key), False)
            key_perplexity[key] = perplexity

            if debug:
                logging.debug("Perplexity for '%s': %.6f" % (key, perplexity))

            total_perplexity += perplexity

    total_perplexity = total_perplexity / count
    user_log.info("Total perplexity: %.6f" % total_perplexity)
    return key_perplexity, total_perplexity
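
One subtlety above: lambda: stream_fn(key) is only safe because model.test presumably invokes it before the loop moves to the next key. Were the callables collected and invoked later, Python's late-binding closures would make all of them see the final key; binding through a default argument avoids that. A quick illustration:

fns_late = [lambda: k for k in range(3)]
fns_bound = [lambda k=k: k for k in range(3)]
print([f() for f in fns_late])   # [2, 2, 2] (all share the final k)
print([f() for f in fns_bound])  # [0, 1, 2] (k captured per iteration)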
Example #15
    def __init__(self, words):
        self.words = sorted(words)
        user_log.info("Vocabulary %d" % len(self.words))
def main(argv):
    ap = ArgumentParser(prog="generate-semantic-model")
    ap.add_argument("-v",
                    "--verbose",
                    default=False,
                    action="store_true",
                    help="Turn on verbose logging.")
    ap.add_argument("-i", "--initial-decays", default=5, type=int)
    ap.add_argument("-c", "--convergence-decays", default=2, type=int)
    ap.add_argument("-a", "--arc-epochs", default=3, type=int)
    ap.add_argument("-l", "--layers", default=2, type=int)
    ap.add_argument("-w", "--width", default=100, type=int)
    ap.add_argument("--word-input", default=False, action="store_true")
    ap.add_argument("-p", "--pre-existing", default=False, action="store_true")
    ap.add_argument("-m", "--monolith", default=False, action="store_true")
    ap.add_argument("--key-set", nargs="*", default=None)
    ap.add_argument("data_dir")
    ap.add_argument("sequential_dir")
    ap.add_argument("states_dir")
    ap.add_argument("encoding_dir")
    aargs = ap.parse_args(argv)
    setup_logging(".%s.log" % os.path.splitext(os.path.basename(__file__))[0],
                  aargs.verbose, False, True, True)
    logging.debug(aargs)

    lstm = sequential.load_model(aargs.data_dir, aargs.sequential_dir, True)
    user_log.info("Sem")
    hyper_parameters = model.HyperParameters(aargs.layers, aargs.width)
    extra = {
        "word_input": aargs.word_input,
        "monolith": aargs.monolith,
    }

    if aargs.pre_existing:
        sem = load_sem(lstm, aargs.encoding_dir)
    else:
        sem = generate_sem(lstm, hyper_parameters, extra, aargs.states_dir,
                           aargs.arc_epochs, aargs.encoding_dir, aargs.key_set,
                           aargs.initial_decays, aargs.convergence_decays)

    keys_sem, total_sem = test_model(lstm, sem, aargs.states_dir, False,
                                     aargs.key_set)
    # TODO
    #user_log.info("Baseline")
    #baseline = generate_baseline(aargs.data_dir, lstm, hyper_parameters, extra)
    #scores_baseline, totals_baseline = test_model(lstm, baseline, aargs.states_dir, True, aargs.key_set)

    with open(os.path.join(aargs.encoding_dir, "analysis-breakdown.csv"),
              "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "key", "perplexity"])

        for key, perplexity in sorted(keys_sem.items()):
            writer.writerow(["sem", key, "%f" % perplexity])

        #for key, scores in sorted(scores_baseline.items()):
        #    for name, score in sorted(scores.items()):
        #        writer.writerow(["baseline", key, name, "%f" % score])

    with open(os.path.join(aargs.encoding_dir, "analysis-totals.csv"),
              "w") as fh:
        writer = csv_writer(fh)
        writer.writerow(["technique", "perplexity"])
        writer.writerow(["sem", "%f" % total_sem])

        #for name, score in sorted(totals_baseline.items()):
        #    writer.writerow(["baseline", name, "%f" % score])

    return 0
Example #17
def categorize_rates(lstm, xys, dimensions, report):
    total = 0
    non_monotonic = 0
    non_monotonic_counts = {}
    starts = {
        "global": {dimension: 0 for dimension in dimensions},
        "monotonic": {dimension: 0 for dimension in dimensions},
        "non-monotonic": {dimension: 0 for dimension in dimensions},
    }
    ends = {
        "global": {dimension: 0 for dimension in dimensions},
        "monotonic": {dimension: 0 for dimension in dimensions},
        "non-monotonic": {dimension: 0 for dimension in dimensions},
    }
    global_lowest1 = {dimension: (None, None, None) for dimension in dimensions}
    global_lowest2 = {dimension: (None, None, None) for dimension in dimensions}
    global_lowest3 = {dimension: (None, None, None) for dimension in dimensions}
    largest_drop = {dimension: (None, None, None) for dimension in dimensions}
    minimum_growth = {dimension: (None, None, None) for dimension in dimensions}

    for j, xy in enumerate(xys):
        if j % 1000 == 0:
            logging.debug("At instance %d." % j)

        sequence = [item[0] for item in xy.x]
        total += 1
        stepwise_rnn = lstm.stepwise(handle_unknown=True)
        cells = []
        previous = None
        index = None

        for i, word_pos in enumerate(xy.x):
            result, instruments = stepwise_rnn.step(word_pos[0], ["cells"])
            state = instruments["cells"][0]
            activations = [state[dimension] for dimension in dimensions]
            cells += [activations]

            if previous is not None and any([
                    current < (previous[k] * 0.75)
                    for k, current in enumerate(activations)
            ]):
                # A drop of more than 25% on any tracked dimension marks this
                # timestep as non-monotonic.
                index = i

            previous = activations

        for k, dimension in enumerate(dimensions):
            ck = [c[k] for c in cells]

            if global_lowest1[dimension][0] is None or lower1(
                    ck, global_lowest1[dimension][1]):
                global_lowest1[dimension] = ("moot", ck, sequence)

            if global_lowest2[dimension][0] is None or (
                    sum(ck) / len(ck)) < global_lowest2[dimension][0]:
                global_lowest2[dimension] = (sum(ck) / len(ck), ck, sequence)

            if global_lowest3[dimension][0] is None or min(
                    ck) < global_lowest3[dimension][0]:
                global_lowest3[dimension] = (min(ck), ck, sequence)

            for i in range(len(ck) - 1):
                if largest_drop[dimension][0] is None or (
                        ck[i + 1] - ck[i]) < largest_drop[dimension][0]:
                    largest_drop[dimension] = (ck[i + 1] - ck[i], ck, sequence)

            if len(ck) > 1:
                if minimum_growth[dimension][0] is None or (
                        ck[-1] - ck[0]) < minimum_growth[dimension][0]:
                    minimum_growth[dimension] = (ck[-1] - ck[0], ck, sequence)

            starts["global"][dimension] += cells[0][k]
            ends["global"][dimension] += cells[-1][k]

            if index is None:
                starts["monotonic"][dimension] += cells[0][k]
                ends["monotonic"][dimension] += cells[-1][k]
            else:
                starts["non-monotonic"][dimension] += cells[0][k]
                ends["non-monotonic"][dimension] += cells[-1][k]

        if index is not None:
            if report:
                logging.debug("non-monotonic @%d (%s): %s -> %s" %
                              (index, sequence[index], " ".join(sequence),
                               " ".join([str(c) for c in cells])))

            non_monotonic += 1

            if sequence[index] not in non_monotonic_counts:
                non_monotonic_counts[sequence[index]] = 0

            non_monotonic_counts[sequence[index]] += 1

    user_log.info("Found %d of %d sentences to match non-monotonic criteria." %
                  (non_monotonic, total))
    user_log.info("Non-monotonic keyword frequencies: %s" %
                  (adjutant.dict_as_str(
                      non_monotonic_counts, sort_by_key=False, reverse=True)))

    for dimension in dimensions:
        user_log.info("Global lowest @%d (by progression): %s" %
                      (dimension, global_lowest1[dimension]))
        user_log.info("Global lowest @%d (by average): %s" %
                      (dimension, global_lowest2[dimension]))
        user_log.info("Global lowest @%d (by single minimum): %s" %
                      (dimension, global_lowest3[dimension]))
        user_log.info("Global lowest @%d (by largest drop): %s" %
                      (dimension, largest_drop[dimension]))
        user_log.info("Global lowest @%d (by minimum growth): %s" %
                      (dimension, minimum_growth[dimension]))

    averages = {
        "global": {
            dimension: (starts["global"][dimension] / total,
                        ends["global"][dimension] / total)
            for dimension in dimensions
        },
        "monotonic": {
            dimension:
            (starts["monotonic"][dimension] / (total - non_monotonic),
             ends["monotonic"][dimension] / (total - non_monotonic))
            for dimension in dimensions
        },
        "non-monotonic": {
            dimension: (starts["non-monotonic"][dimension] / non_monotonic,
                        ends["non-monotonic"][dimension] / non_monotonic)
            for dimension in dimensions
        },
    }
    logging.debug(adjutant.dict_as_str(averages))
    return averages
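
One caveat with the averages computed above: the divisors non_monotonic and total - non_monotonic can each be zero when every (or no) sentence trips the non-monotonic check, which would raise ZeroDivisionError. A hedged guard, if those edge cases can occur in practice:

def safe_ratio(numerator, denominator):
    # Avoid ZeroDivisionError for empty categories; None marks "no data".
    return numerator / denominator if denominator else None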