Exemplo n.º 1
0
def test_cues_outcomes():
    """Check that cue/outcome counting does not depend on the process count.

    The event resource file is counted once with the default arguments and
    once with ``number_of_processes=6`` and ``verbose=True``; the event
    count and both frequency maps must agree between the two runs.
    """
    total_events, cue_counts, outcome_counts = count.cues_outcomes(
        EVENT_RESOURCE_FILE)
    total_events_par, cue_counts_par, outcome_counts_par = count.cues_outcomes(
        EVENT_RESOURCE_FILE, number_of_processes=6, verbose=True)

    # Expected number of events in the resource file.
    assert total_events == 2772

    # The multi-process run has to reproduce the default run exactly.
    assert total_events == total_events_par
    assert cue_counts == cue_counts_par
    assert outcome_counts == outcome_counts_par
Exemplo n.º 2
0
def compare_arrays(file_path, arr1, arr2):
    """Compare two weight containers for every (outcome, cue) pair.

    The cues and outcomes are taken from the event file at ``file_path``.
    For each pair the corresponding entry is read from ``arr1`` and
    ``arr2`` and the two values are compared with ``np.isclose``
    (``rtol=1e-02``, ``atol=1e-05``).

    Parameters
    ----------
    file_path : str or path
        Event file passed to ``count.cues_outcomes`` and
        ``generate_mapping``.
    arr1, arr2 : np.ndarray, xr.DataArray, pd.DataFrame or nested mapping
        Containers to compare.  A ``np.ndarray`` is indexed as
        ``array[outcome_index][cue_index]`` using the integer maps returned
        by ``generate_mapping``; a ``xr.DataArray`` is indexed by the
        ``'outcomes'`` and ``'cues'`` labels; a ``pd.DataFrame`` as
        ``array.loc[outcome][cue]``; anything else as
        ``array[outcome][cue]``.

    Returns
    -------
    tuple
        ``(unequal, unequal_ratio)`` where ``unequal`` is a list of
        ``(outcome, cue, value1, value2)`` tuples for the entries that are
        not close, and ``unequal_ratio`` is ``len(unequal)`` divided by the
        number of compared pairs.

    Raises
    ------
    ZeroDivisionError
        If the event file yields no cues or no outcomes.
    """
    _, cues, outcomes = count.cues_outcomes(file_path)
    cue_map, outcome_map, _ = generate_mapping(file_path)

    def entry(array, outcome, cue):
        # Read one value, choosing the indexing scheme by container type.
        if isinstance(array, np.ndarray):
            return array[outcome_map[outcome]][cue_map[cue]]
        if isinstance(array, xr.DataArray):
            return array.loc[{
                'outcomes': outcome,
                'cues': cue
            }].values
        if isinstance(array, pd.DataFrame):
            return array.loc[outcome][cue]
        return array[outcome][cue]

    unequal = []
    for outcome in outcomes:
        for cue in cues:
            value1 = entry(arr1, outcome, cue)
            value2 = entry(arr2, outcome, cue)
            if not np.isclose(value1, value2, rtol=1e-02, atol=1e-05):
                unequal.append((outcome, cue, value1, value2))

    unequal_ratio = len(unequal) / (len(outcomes) * len(cues))
    return (unequal, unequal_ratio)
Exemplo n.º 3
0
def test_save_load():
    """Round-trip the cue counter through ``save_counter``/``load_counter``.

    The counter written to ``temp/cues.tab`` must compare equal to the one
    read back.  The temporary file is removed even when loading or the
    comparison fails, so a failing run does not leave artefacts behind.
    """
    file_name = os.path.join(TEST_ROOT, "temp/cues.tab")
    _, cues, _ = count.cues_outcomes(EVENT_RESOURCE_FILE)
    count.save_counter(cues, file_name)
    try:
        cues_loaded = count.load_counter(file_name)
        assert cues == cues_loaded
    finally:
        # Clean up regardless of the outcome of the assertions above.
        os.remove(file_name)
Exemplo n.º 4
0
def generate_mapping(event_path):
    """Build index maps for the cues and outcomes of an event file.

    Parameters
    ----------
    event_path : str or path
        Event file passed to ``count.cues_outcomes``.

    Returns
    -------
    tuple
        ``(cue_map, outcome_map, all_outcomes)``: two ``OrderedDict``
        objects mapping each cue / outcome to its position in the key
        order of the respective frequency map, and the list of outcomes in
        that same order.
    """
    _, cue_counts, outcome_counts = count.cues_outcomes(event_path)
    all_cues = list(cue_counts.keys())
    all_outcomes = list(outcome_counts.keys())

    cue_map = OrderedDict()
    for position, cue in enumerate(all_cues):
        cue_map[cue] = position

    outcome_map = OrderedDict()
    for position, outcome in enumerate(all_outcomes):
        outcome_map[outcome] = position

    return (cue_map, outcome_map, all_outcomes)
Exemplo n.º 5
0
def filter_tagged_event_file(input_event_file,
                             filtered_event_file,
                             cues, outcomes,
                             fill_cues=0,
                             fill_outcomes=0,
                             overwrite=False,
                             number_of_processes=1):
    """Filter an event file with merged tokens and tags by untagged targets.

    The cues and outcomes of ``input_event_file`` are counted, reduced with
    ``filter_tagged_vocabulary`` using the given untagged collections,
    optionally extended with ``add_most_frequent``, and the resulting
    selections are handed to ``filter_event_file``.

    Parameters
    ----------
    input_event_file : str or path
        Path to event file with tokens and tags merged
    filtered_event_file : str or path
        Path to resulting event file
    cues : collection
        Collection of target cues (without tags)
    outcomes : collection
        Collection of target outcomes (without tags)
    fill_cues : int
        Fill cues with most frequent words to size fill_cues.
        If 0, no words will be added.
    fill_outcomes : int
        Fill outcomes with most frequent words to size fill_outcomes.
        If 0, no words will be added.
    overwrite : bool
        Overwrite filtered_event_file if it exists
    number_of_processes : int
        Number of processes to use

    Raises
    ------
    OSError
        If ``filtered_event_file`` already exists and ``overwrite`` is
        false.
    """
    target_present = exists(filtered_event_file)
    if target_present and not overwrite:
        raise OSError(
            f"'{filtered_event_file}' already exists and overwrite=False!")

    # Only the two frequency maps are needed; the event count is ignored.
    _, all_cues, all_outcomes = cues_outcomes(
        input_event_file, number_of_processes=number_of_processes)

    keep_cues = filter_tagged_vocabulary(all_cues, cues)
    keep_outcomes = filter_tagged_vocabulary(all_outcomes, outcomes)

    if fill_cues:
        keep_cues = add_most_frequent(keep_cues, all_cues, fill_cues)
    if fill_outcomes:
        keep_outcomes = add_most_frequent(keep_outcomes, all_outcomes,
                                          fill_outcomes)

    filter_event_file(input_event_file, filtered_event_file,
                      keep_cues=keep_cues, keep_outcomes=keep_outcomes,
                      number_of_processes=number_of_processes)
Exemplo n.º 6
0
def test_preprocessing():
    """Run the preprocessing pipeline on the test corpus end to end.

    Steps: create an event file from ``resources/corpus.txt``, count its
    cues and outcomes, reduce the outcomes with ``bandsample`` and filter
    the event file by the sampled outcomes.  The temporary files are
    removed in a ``finally`` block so that a failure in any step does not
    leave artefacts in ``temp/`` for the next run.
    """
    corpus_file = os.path.join(TEST_ROOT, "resources/corpus.txt")
    event_file = os.path.join(TEST_ROOT, "temp/events_corpus.tab.gz")
    event_file_filtered = event_file + ".filtered"
    symbols = "abcdefghijklmnopqrstuvwxyzóąćęłńśźż"  # polish

    try:
        # create event file
        create_event_file(corpus_file, event_file, symbols,
                          context_structure="document",
                          event_structure="consecutive_words",
                          event_options=(3, ),
                          lower_case=True, verbose=True)

        # read in cues and outcomes
        n_events, cue_freq_map, outcome_freq_map = cues_outcomes(event_file,
                                                                 number_of_processes=2)
        cues = sorted(cue_freq_map.keys())
        # The id maps are only consumed by the disabled block below; they
        # are kept so that block can be re-enabled unchanged.
        cue_id_map = {cue: ii for ii, cue in enumerate(cues)}

        # reduce number of outcomes through bandsampling
        outcome_freq_map_filtered = bandsample(outcome_freq_map, 50, cutoff=1, seed=None)
        outcomes = sorted(outcome_freq_map_filtered.keys())
        outcome_id_map = {outcome: nn for nn, outcome in enumerate(outcomes)}

        # filter outcomes by reduced number of outcomes
        filter_event_file(event_file, event_file_filtered, keep_outcomes=outcomes)

        # TODO this is not working at the moment
        # create binary event files
        # path_name = event_file_filtered + ".events"
        # create_binary_event_files(event_file_filtered, path_name, cue_id_map,
        #                           outcome_id_map, sort_within_event=False,
        #                           number_of_processes=2, events_per_file=1000,
        #                           verbose=True)
        # with pytest.raises(IOError):
        #    create_binary_event_files(event_file_filtered, path_name, cue_id_map,
        #                            outcome_id_map, sort_within_event=False,
        #                            number_of_processes=2, events_per_file=1000,
        #                            verbose=True)
        # overwrite=True
        # create_binary_event_files(event_file_filtered, path_name, cue_id_map,
        #                        outcome_id_map, sort_within_event=False,
        #                        number_of_processes=2, events_per_file=1000,
        #                        overwrite=True, verbose=True)

        # Both artefacts must have been written; previously this was only
        # checked implicitly by the unconditional os.remove calls.
        assert os.path.exists(event_file)
        assert os.path.exists(event_file_filtered)
    finally:
        # clean everything, whether or not the steps above succeeded
        for path in (event_file, event_file_filtered):
            if os.path.exists(path):
                os.remove(path)
Exemplo n.º 7
0
def test_bandsample():
    """Check that ``bandsample`` returns the requested number of outcomes.

    The outcome frequencies of the trigram event file are sampled down to
    50 entries.  The reference counter is loaded as well; if it is missing,
    the current sample is written to ``temp/`` before the error is
    re-raised.  Finally ``bandsample`` is run once more in verbose mode.
    """
    resource_file = os.path.join(TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    _, _, outcome_counts = cues_outcomes(resource_file, number_of_processes=2)

    sampled = bandsample(outcome_counts, 50, cutoff=1, seed=None, verbose=False)
    assert len(sampled) == 50

    reference_file = os.path.join(TEST_ROOT, 'reference/bandsampled_outcomes.tab')
    try:
        # NOTE(review): the reference is loaded but never compared with the
        # sample (the sample is drawn with seed=None) -- confirm whether a
        # comparison is intended.
        reference = load_counter(reference_file)
    except FileNotFoundError:
        # Store the current sample so it can serve as a new reference.
        save_counter(sampled,
                     os.path.join(TEST_ROOT, 'temp/bandsampled_outcomes.tab'))
        raise

    # Exercise the verbose code path; the result is not inspected.
    bandsample(outcome_counts, 50, cutoff=1, verbose=True)
Exemplo n.º 8
0
def test_write_events():
    """Exercise ``write_events`` and ``_job_binary_event_file``.

    Covered cases: writing without ``start``/``stop`` (expected to raise
    ``StopIteration``), writing the slice 10..20, writing a slice beyond
    the end of the event file (zero events), running the binary-file job
    twice against the same target file, and reading a malformed event file
    (expected to raise ``ValueError``).
    """
    event_file = os.path.join(TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    _, cue_counts, outcome_counts = cues_outcomes(event_file)

    # Ids follow the sorted order of the outcome / cue strings.
    sorted_outcomes = sorted(outcome_counts.keys())
    sorted_cues = sorted(cue_counts.keys())
    cue_id_map = {cue: idx for idx, cue in enumerate(sorted_cues)}
    outcome_id_map = {outcome: idx for idx, outcome in enumerate(sorted_outcomes)}

    def fresh_events():
        # A new generator is needed for every write_events call.
        return event_generator(event_file, cue_id_map, outcome_id_map,
                               sort_within_event=True)

    events = fresh_events()
    file_name = os.path.join(TEST_ROOT, "temp/events.bin")
    with pytest.raises(StopIteration):
        write_events(events, file_name, remove_duplicates=True)
    os.remove(file_name)

    # start stop
    written = write_events(fresh_events(), file_name,
                           start=10, stop=20, remove_duplicates=True)
    assert written == 10
    os.remove(file_name)

    # no events
    written = write_events(fresh_events(), file_name,
                           start=100000, stop=100010, remove_duplicates=True)
    assert written == 0

    # The job is run twice with identical arguments on the same file_name
    # (presumably to cover an already existing target file -- confirm).
    job_arguments = dict(file_name=file_name, event_file=event_file,
                         cue_id_map=cue_id_map,
                         outcome_id_map=outcome_id_map,
                         sort_within_event=False,
                         start=0, stop=10, remove_duplicates=True)
    for _ in range(2):
        _job_binary_event_file(**job_arguments)
    os.remove(file_name)

    # bad event file
    with pytest.raises(ValueError):
        event_bad_file = os.path.join(TEST_ROOT, "resources/event_file_trigrams_to_word_BAD.tab.gz")
        bad_events = event_generator(event_bad_file, cue_id_map,
                                     outcome_id_map)
        # traverse generator
        for _event in bad_events:
            pass
Exemplo n.º 9
0
def test_filter_event_file():
    """Check that ``filter_event_file`` keeps exactly the requested items.

    The trigram event file is filtered for two cues and one outcome; the
    cues and outcomes counted in the filtered file must match the requested
    ones.  The filtered file is removed even if an assertion fails, so a
    failing run does not leave a stale output file behind.
    """
    input_event_file = os.path.join(TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    output_event_file = os.path.join(TEST_ROOT, "temp/event_file_filtered.tab.gz")
    cues = sorted(["#of", "of#"])
    outcomes = sorted(["of", ])
    filter_event_file(input_event_file, output_event_file,
                      keep_cues=cues,
                      keep_outcomes=outcomes,
                      number_of_processes=2,
                      verbose=True)
    try:
        _, cue_freq_map, outcome_freq_map = cues_outcomes(output_event_file)
        # Compare in sorted order; the frequency maps carry no useful order.
        assert cues == sorted(cue_freq_map)
        assert outcomes == sorted(outcome_freq_map)
    finally:
        os.remove(output_event_file)
Exemplo n.º 10
0
def test_read_binary_file():
    """Check that a binary event file reproduces the text event file.

    The trigram event file is converted with ``create_binary_event_files``
    and read back with ``read_binary_file``.  The number of events written
    must equal the number of events in the text file, and every cue and
    outcome id in the binary events must match the id maps that were used
    for writing.

    Raises
    ------
    ValueError
        If an event and its binary counterpart differ in the number of
        cues or in the number of outcomes.
    """
    file_path = "resources/event_file_trigrams_to_word.tab.gz"
    binary_path = "binary_resources/"

    abs_file_path = os.path.join(TEST_ROOT, file_path)
    abs_binary_path = os.path.join(TEST_ROOT, binary_path)
    abs_binary_file_path = os.path.join(abs_binary_path, "events_0_0.dat")

    n_events, cues, outcomes = cues_outcomes(abs_file_path)
    cue_id_map = OrderedDict(((cue, ii) for ii, cue in enumerate(cues.keys())))
    outcome_id_map = OrderedDict(((outcome, ii) for ii, outcome in enumerate(outcomes.keys())))

    number_events = create_binary_event_files(abs_file_path, abs_binary_path, cue_id_map,
                                              outcome_id_map, overwrite=True, remove_duplicates=False)

    try:
        bin_events = read_binary_file(abs_binary_file_path)
        events = ndl.events_from_file(abs_file_path)
        # Second pass over the text file, used only for counting events.
        events_dup = ndl.events_from_file(abs_file_path)

        assert number_events == len(list(events_dup))

        for event, bin_event in zip(events, bin_events):
            # Distinct names avoid shadowing the frequency maps above.
            event_cues, event_outcomes = event
            bin_cues, bin_outcomes = bin_event
            if len(event_cues) != len(bin_cues):
                raise ValueError('Cues have different length')
            if len(event_outcomes) != len(bin_outcomes):
                raise ValueError('Outcomes have different length')

            for cue, bin_cue in zip(event_cues, bin_cues):
                assert cue_id_map[cue] == bin_cue

            for outcome, bin_outcome in zip(event_outcomes, bin_outcomes):
                assert outcome_id_map[outcome] == bin_outcome
    finally:
        # clean everything, also when a check above fails
        os.remove(abs_binary_file_path)