Exemplo n.º 1
0
def start_analysis():
    """Read the input data, run the analysis, and package the results.

    The data set is read from ``FLAGS.input_data`` with the reader selected
    by ``FLAGS.input_form``. Sequences whose language passes
    ``languages.should_keep`` are serialized, split into segments, and handed
    to ``do_analysis`` -- through a process pool when ``FLAGS.parallelism`` is
    greater than 1, otherwise in the current process.

    Side effects:
        * Appends to the module-level ``PFE_METHODS`` when the data set
          carries a ``logged_method_name``.
        * Calls ``write_failed_indices`` when some sequences failed and
          ``FLAGS.failed_indices_out`` is set.

    Returns:
        A ``result_pb2.AnalysisResultProto`` whose ``results`` field holds
        every message produced by ``to_protos``.

    Raises:
        ValueError: If ``FLAGS.input_form`` is not 'binary', 'text' or
            'json'.
    """
    input_data_path = FLAGS.input_data

    LOG.info("Reading input data ...")
    readers = {
        "binary": read_binary_input,
        "text": read_text_input,
        "json": read_json_input,
    }
    reader = readers.get(FLAGS.input_form)
    if reader is None:
        # Fail loudly here: continuing would only crash a few lines later
        # on the never-assigned data set, hiding the real cause.
        message = (
            "Unknown input_form. Needs to be 'binary', 'text', or 'json'.")
        LOG.error(message)
        raise ValueError(message)
    data_set = reader(input_data_path)
    LOG.info('Read %s sequences', len(data_set.sequences))

    if data_set.logged_method_name:
        PFE_METHODS.append(
            logged_pfe_method.for_name(data_set.logged_method_name))

    LOG.info("Preparing input data.")
    # The sequence protos need to be serialized since they are being
    # sent to another process.
    sequences = [
        sequence.SerializeToString() for sequence in data_set.sequences
        if languages.should_keep(sequence.language)
    ]
    segmented_sequences, segment_size = segment_sequences(
        sequences, FLAGS.parallelism * 2)

    LOG.info("Running simulations on %s sequences.", len(sequences))
    if FLAGS.parallelism > 1:
        with Pool(FLAGS.parallelism) as pool:
            segment_results = pool.map(do_analysis, segmented_sequences)
    else:
        segment_results = [do_analysis(s) for s in segmented_sequences]
    merged = merge_results(segment_results, segment_size)

    if merged.failed_indices:
        LOG.info("%s sequences dropped due to errors in simulation.",
                 len(merged.failed_indices))
        if FLAGS.failed_indices_out:
            write_failed_indices(merged.failed_indices)

    LOG.info("Formatting output.")
    results_proto = result_pb2.AnalysisResultProto()
    results_proto.results.extend(
        to_protos(merged.totals_by_method, cost.cost))

    return results_proto
Exemplo n.º 2
0
def main(argv):
    """Write a filtered copy of the binary input data set to stdout.

    Reads the data set from ``FLAGS.input_data``, keeps only the sequences
    whose language passes ``languages.should_keep`` and for which ``sample()``
    returns a truthy value, and writes the serialized result to standard
    output as raw bytes.

    Args:
        argv: Command-line arguments; ignored.
    """
    del argv  # Unused.

    data_set = read_binary_input(FLAGS.input_data)

    # `and` short-circuits, so sample() is only consulted for sequences
    # that already passed the language filter.
    retained = [
        candidate for candidate in data_set.sequences
        if languages.should_keep(candidate.language) and sample()
    ]

    # Swap the contents of the sequences field for the retained subset.
    del data_set.sequences[:]
    data_set.sequences.extend(retained)

    serialized = data_set.SerializeToString()
    sys.stdout.buffer.write(serialized)
Exemplo n.º 3
0
 def test_should_keep_default(self):
     # should_keep must return a truthy value for "hello".
     # NOTE(review): any flag/fixture setup for this case lives outside
     # the visible snippet -- confirm against the enclosing test class.
     kept = languages.should_keep("hello")
     self.assertTrue(kept)
Exemplo n.º 4
0
 def test_script_category_takes_priority(self):
     # "en" must be rejected while "ja" and "zh" are kept.
     # NOTE(review): the configuration that makes this hold is not
     # visible here -- presumably set by a decorator or setUp; confirm.
     self.assertFalse(languages.should_keep("en"))
     for kept_code in ("ja", "zh"):
         self.assertTrue(languages.should_keep(kept_code))
Exemplo n.º 5
0
 def test_with_invalid_script_category(self):
     # All three codes must be kept in this scenario.
     # NOTE(review): the "invalid script category" setup is outside the
     # visible snippet -- confirm against the enclosing test class.
     for kept_code in ("en", "ja", "zh"):
         self.assertTrue(languages.should_keep(kept_code))
Exemplo n.º 6
0
 def test_with_language_filter(self):
     # "en" and "ja" must be kept while "zh" is rejected.
     # NOTE(review): the language-filter configuration is not visible
     # here -- presumably set by a decorator or setUp; confirm.
     for kept_code in ("en", "ja"):
         self.assertTrue(languages.should_keep(kept_code))
     self.assertFalse(languages.should_keep("zh"))