Пример #1
0
def test_aclew(project):
    """Compare AclewMetrics output with the stored truth table.

    The fake vocalizations from tests/data/aclew.csv are imported under three
    sets, which are then passed to AclewMetrics as its vtc, alice and vcm
    inputs.
    """
    vocalizations = pd.read_csv("tests/data/aclew.csv")

    # One index row per set; every row covers the same 0-4000 range.
    index_rows = []
    for set_name in ("aclew_vtc", "aclew_alice", "aclew_vcm"):
        index_rows.append({
            "set": set_name,
            "raw_filename": "file.rttm",
            "time_seek": 0,
            "recording_filename": "sound.wav",
            "range_onset": 0,
            "range_offset": 4000,
            "format": "rttm",
        })

    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.DataFrame(index_rows),
        import_function=partial(fake_vocs, vocalizations),
    )

    extractor = AclewMetrics(
        project,
        by="child_id",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        vtc="aclew_vtc",
        alice="aclew_alice",
        vcm="aclew_vcm",
    )
    extractor.extract()

    expected = pd.read_csv("tests/truth/aclew_metrics.csv")
    pd.testing.assert_frame_equal(extractor.metrics, expected, check_like=True)
Пример #2
0
def test_lena(project):
    """Compare LenaMetrics output with the stored truth table.

    Fake vocalizations from tests/data/lena_its.csv are imported as the
    "lena_its" set; metrics are extracted with period='1h',
    from_time='10:00:00' and to_time='16:00:00'.
    """
    its_data = pd.read_csv("tests/data/lena_its.csv")

    index = pd.DataFrame([{
        "set": "lena_its",
        "raw_filename": "file.its",
        "time_seek": 0,
        "recording_filename": "sound.wav",
        "range_onset": 0,
        "range_offset": 100000000,
        "format": "its",
    }])

    manager = AnnotationManager(project)
    manager.import_annotations(
        index, import_function=partial(fake_vocs, its_data))

    extractor = LenaMetrics(
        project,
        set="lena_its",
        period="1h",
        from_time="10:00:00",
        to_time="16:00:00",
    )
    extractor.extract()

    expected = pd.read_csv("tests/truth/lena_metrics.csv")
    pd.testing.assert_frame_equal(extractor.metrics, expected, check_like=True)
Пример #3
0
def test_intersect(project):
    """Compare the intersection of two annotation sets with stored truth.

    The "textgrid" and "vtc_rttm" rows of the intersection are checked
    separately, ignoring the imported_at, package_version and merged_from
    columns.
    """
    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.read_csv("examples/valid_raw_data/annotations/intersect.csv"))

    selected = manager.annotations[
        manager.annotations["set"].isin(["textgrid", "vtc_rttm"])
    ]
    intersection = AnnotationManager.intersection(selected).convert_dtypes()

    # list.remove is used on purpose: it fails loudly if a column is absent.
    columns = intersection.columns.tolist()
    for ignored in ("imported_at", "package_version", "merged_from"):
        columns.remove(ignored)

    cases = (
        ("textgrid", "tests/truth/intersect_a.csv"),
        ("vtc_rttm", "tests/truth/intersect_b.csv"),
    )
    for set_name, truth_path in cases:
        actual = intersection[intersection["set"] == set_name]
        expected = standardize_dataframe(pd.read_csv(truth_path), columns)

        pd.testing.assert_frame_equal(
            standardize_dataframe(actual, columns),
            expected.convert_dtypes(),
        )
Пример #4
0
def test_specs(project):
    """Run MetricsSpecificationPipeline twice and compare both outputs to truth.

    The second run is fed the ``parameters_path`` exposed by the pipeline
    after the first run; both runs must produce the same table.
    """
    its_data = pd.read_csv("tests/data/lena_its.csv")

    index = pd.DataFrame([{
        "set": "specs_its",
        "raw_filename": "file.its",
        "time_seek": 0,
        "recording_filename": "sound.wav",
        "range_onset": 0,
        "range_offset": 100000000,
        "format": "its",
    }])
    manager = AnnotationManager(project)
    manager.import_annotations(
        index, import_function=partial(fake_vocs, its_data))

    pipeline = MetricsSpecificationPipeline()

    def run_and_load(parameters):
        # Run the pipeline, then read back whatever it wrote.
        pipeline.run(parameters)
        return pd.read_csv(pipeline.destination)

    first_output = run_and_load("tests/data/parameters_metrics.yml")
    expected = pd.read_csv("tests/truth/specs_metrics.csv")
    pd.testing.assert_frame_equal(first_output, expected, check_like=True)

    second_output = run_and_load(pipeline.parameters_path)
    pd.testing.assert_frame_equal(second_output, expected, check_like=True)
Пример #5
0
def test_random_vocalization(project):
    """Check that RandomVocalizationSampler returns the targeted segment.

    Three fake segments with identical bounds (speakers CHI, FEM and MAL)
    are imported; sampling one "CHI" vocalization must return the bounds of
    the CHI segment.
    """
    records = []
    for speaker in ("CHI", "FEM", "MAL"):
        records.append({
            "segment_onset": 1000,
            "segment_offset": 2000,
            "speaker_type": speaker,
        })
    segments = pd.DataFrame(records)

    index = pd.DataFrame([{
        "set": "random",
        "raw_filename": "file.rttm",
        "time_seek": 0,
        "recording_filename": "sound.wav",
        "range_onset": 0,
        "range_offset": 4000,
        "format": "rttm",
    }])
    manager = AnnotationManager(project)
    manager.import_annotations(
        index, import_function=partial(fake_conversation, segments))

    sampler = RandomVocalizationSampler(
        project=project,
        annotation_set="random",
        target_speaker_type=["CHI"],
        sample_size=1,
        threads=1,
    )
    sampler.sample()

    bounds = ["segment_onset", "segment_offset"]
    expected = segments.loc[segments["speaker_type"] == "CHI", bounds]
    pd.testing.assert_frame_equal(
        sampler.segments[bounds].astype(int),
        expected.astype(int),
    )
Пример #6
0
def test_custom(project):
    """Compare CustomMetrics output with the stored truth table.

    The metrics to compute are listed in tests/data/list_metrics.csv; the
    annotations are fake vocalizations imported as the "custom_its" set.
    """
    manager = AnnotationManager(project)
    its_data = pd.read_csv("tests/data/lena_its.csv")

    index = pd.DataFrame([{
        "set": "custom_its",
        "raw_filename": "file.its",
        "time_seek": 0,
        "recording_filename": "sound.wav",
        "range_onset": 0,
        "range_offset": 100000000,
        "format": "its",
    }])
    manager.import_annotations(
        index, import_function=partial(fake_vocs, its_data))

    extractor = CustomMetrics(project, "tests/data/list_metrics.csv")
    extractor.extract()

    expected = pd.read_csv("tests/truth/custom_metrics.csv")
    pd.testing.assert_frame_equal(extractor.metrics, expected, check_like=True)
Пример #7
0
def import_annotations(args):
    """convert and import a set of annotations

    The project at ``args.source`` is validated first; the process exits
    with status 1 if that validation reports any error.

    The annotations index is read from the CSV file ``args.annotations``
    when it is set. Otherwise a one-row index is built from the attributes
    of ``args`` named after the non-generated
    ``AnnotationManager.INDEX_COLUMNS``.

    After the import, the annotations are validated and a report is printed
    whenever the import or the validation produced at least one error.

    :param args: parsed command-line arguments (``source``, ``annotations``
        and one attribute per non-generated index column)
    """

    project = ChildProject(args.source)
    input_errors, _ = project.validate_input_data()

    if len(input_errors) > 0:
        print("validation failed, {} error(s) occured".format(
            len(input_errors)),
              file=sys.stderr)
        sys.exit(1)

    if args.annotations:
        annotations = pd.read_csv(args.annotations)
    else:
        annotations = pd.DataFrame([{
            col.name: getattr(args, col.name)
            for col in AnnotationManager.INDEX_COLUMNS if not col.generated
        }])

    am = AnnotationManager(project)
    am.import_annotations(annotations)

    errors, warnings = am.validate()

    # Report on validation errors too: gating on am.errors alone silently
    # dropped them whenever the import itself recorded no error.
    error_count = len(am.errors) + len(errors)
    if error_count > 0:
        print("importation completed with {} errors and {} warnings".format(
            error_count, len(warnings)),
              file=sys.stderr)
        print("\n".join(am.errors), file=sys.stderr)
        print("\n".join(errors), file=sys.stderr)
        print("\n".join(warnings))
Пример #8
0
def test_segments_timestamps(project):
    """Check the clock times computed by get_segments_timestamps.

    A one-second segment starting 3600 * 1000 into "sound.wav" is expected
    to span 10:00:00 to 10:00:01 on 2020-04-20.
    """
    manager = AnnotationManager(project)

    onset = 3600 * 1000
    segment = {
        "recording_filename": "sound.wav",
        "segment_onset": onset,
        "segment_offset": onset + 1000,
    }
    segments = manager.get_segments_timestamps(pd.DataFrame([segment]))

    # Same record as the input, extended with the expected clock times.
    truth = pd.DataFrame([{
        **segment,
        "onset_time": datetime.datetime(2020, 4, 20, 9 + 1, 0, 0),
        "offset_time": datetime.datetime(2020, 4, 20, 9 + 1, 0, 1),
    }])

    compared = truth.columns
    pd.testing.assert_frame_equal(
        standardize_dataframe(segments, compared),
        standardize_dataframe(truth, compared),
    )
Пример #9
0
def test_conversation_sampler(project):
    """Check the segments returned by ConversationSampler.

    Three fake conversations of 5, 10 and 15 alternating FEM/CHI
    vocalizations are imported. The sampler must return one segment per
    conversation, with onsets listed by decreasing vocalization count.
    """
    conversations = [
        {"onset": 0, "vocs": 5},
        {"onset": 60 * 1000, "vocs": 10},
        {"onset": 1800 * 1000, "vocs": 15},
    ]

    # Each vocalization lasts 2000 and the next one starts 500 later.
    duration = 2000
    step = 2000 + 500
    speakers = ["FEM", "CHI"]

    records = []
    for conversation in conversations:
        for i in range(conversation["vocs"]):
            voc_onset = conversation["onset"] + i * step
            records.append({
                "segment_onset": voc_onset,
                "segment_offset": voc_onset + duration,
                "speaker_type": speakers[i % 2],
            })
    segments = pd.DataFrame(records)

    index = pd.DataFrame([{
        "set": "conv",
        "raw_filename": "file.rttm",
        "time_seek": 0,
        "recording_filename": "sound.wav",
        "range_onset": 0,
        "range_offset": 3600 * 1000 * 1000,
        "format": "rttm",
    }])
    manager = AnnotationManager(project)
    manager.import_annotations(
        index, import_function=partial(fake_conversation, segments))

    sampler = ConversationSampler(
        project,
        "conv",
        count=5,
        interval=1000,
        speakers=["FEM", "CHI"],
    )
    sampler.sample()

    by_decreasing_vocs = sorted(
        conversations, key=lambda c: c["vocs"], reverse=True)
    expected_onsets = [conv["onset"] for conv in by_decreasing_vocs]

    assert len(sampler.segments) == len(conversations)
    assert sampler.segments["segment_onset"].tolist() == expected_onsets
def test_import(project):
    """Import the example annotations and compare segments with truth.

    Checks that every input row was imported, that each converted file
    exists under the project's ``annotations`` folder, that validation
    reports no error or warning, and that the segments of the 'eaf',
    'textgrid' and 'eaf_solis' sets match tests/truth/<set>.csv.
    """
    am = AnnotationManager(project)

    input_annotations = pd.read_csv(
        'examples/valid_raw_data/raw_annotations/input.csv')
    am.import_annotations(input_annotations)
    am.read()

    assert am.annotations.shape[0] == input_annotations.shape[
        0], "imported annotations length does not match input"

    assert all(
        os.path.exists(os.path.join(project.path, 'annotations', f))
        for f in am.annotations['annotation_filename'].tolist()
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(
        warnings) == 0, "malformed annotations detected"

    for dataset in ['eaf', 'textgrid', 'eaf_solis']:
        annotations = am.annotations[am.annotations['set'] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=annotations.columns, inplace=True)

        sort_keys = segments.columns.tolist()
        truth = pd.read_csv('tests/truth/{}.csv'.format(dataset))

        # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
        # rtol = atol = 0.5e-3 is the tolerance that
        # check_less_precise=True stood for.
        pd.testing.assert_frame_equal(
            segments.sort_index(axis=1).sort_values(sort_keys).reset_index(
                drop=True),
            truth.sort_index(axis=1).sort_values(sort_keys).reset_index(
                drop=True),
            rtol=0.5e-3,
            atol=0.5e-3)
Пример #11
0
def test_within_ranges(project):
    """Check get_within_ranges on a hand-built annotation index.

    The index holds consecutive 500-long annotations of the "matching" set
    from 0 to 4000. Querying 1000-3000 must return exactly the annotations
    inside that range; querying 1000-5000 with the "raise" option must raise
    an exception carrying the expected coverage message.
    """
    am = AnnotationManager(project)

    annotations = []
    for onset in np.arange(0, 4000, 500):
        annotations.append({
            "recording_filename": "sound.wav",
            "set": "matching",
            "range_onset": onset,
            "range_offset": onset + 500,
        })

    # Annotations that lie entirely within the queried 1000-3000 range.
    inside = [
        row for row in annotations
        if row["range_onset"] >= 1000 and row["range_offset"] <= 3000
    ]
    matching_annotations = pd.DataFrame(inside)

    am.annotations = pd.DataFrame(annotations)

    ranges = pd.DataFrame([{
        "recording_filename": "sound.wav",
        "range_onset": 1000,
        "range_offset": 3000,
    }])

    matches = am.get_within_ranges(ranges, ["matching"])

    compared = matching_annotations.columns
    pd.testing.assert_frame_equal(
        standardize_dataframe(matching_annotations, compared),
        standardize_dataframe(matches, compared),
    )

    # Extend the query beyond what the annotations cover.
    ranges["range_offset"] = 5000
    expected_message = "annotations from set 'matching' do not cover the whole selected range for recording 'sound.wav', 3.000s covered instead of 4.000s"

    raised_message = None
    try:
        am.get_within_ranges(ranges, ["matching"], "raise")
    except Exception as error:
        raised_message = str(error)

    assert (
        raised_message == expected_message
    ), "get_within_ranges should raise an exception when annotations do not fully cover the required ranges"
Пример #12
0
def test_import(project):
    """Import the example annotations and compare segments with truth.

    Checks that every input row was imported, that each converted file
    exists under ``annotations/<set>/converted``, that neither ``validate``
    nor ``read`` reports an error or a warning, and that the segments of the
    "eaf_basic", "textgrid" and "eaf_solis" sets match
    tests/truth/<set>.csv.
    """
    am = AnnotationManager(project)

    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    am.import_annotations(input_annotations)
    am.read()

    assert (
        am.annotations.shape[0] == input_annotations.shape[0]
    ), "imported annotations length does not match input"

    assert all(
        os.path.exists(
            os.path.join(
                project.path,
                "annotations",
                a["set"],
                "converted",
                a["annotation_filename"],
            )
        )
        for a in am.annotations.to_dict(orient="records")
    ), "some annotations are missing"

    errors, warnings = am.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    errors, warnings = am.read()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected"

    for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
        annotations = am.annotations[am.annotations["set"] == dataset]
        segments = am.get_segments(annotations)
        segments.drop(columns=set(annotations.columns) - {"raw_filename"}, inplace=True)
        truth = pd.read_csv("tests/truth/{}.csv".format(dataset))

        print(segments)
        print(truth)

        # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
        # rtol = atol = 0.5e-3 is the tolerance that
        # check_less_precise=True stood for.
        pd.testing.assert_frame_equal(
            standardize_dataframe(segments, set(truth.columns.tolist())),
            standardize_dataframe(truth, set(truth.columns.tolist())),
            rtol=0.5e-3,
            atol=0.5e-3,
        )
Пример #13
0
def test_within_time_range(project):
    """Check get_within_time_range against truth and on invalid input.

    Annotations are filtered with a TimeInterval running from 09:00 to
    20:00 and compared with tests/truth/time_range.csv. Passing the strings
    "9am" and "8pm" instead must raise a ValueError.
    """
    from ChildProject.utils import TimeInterval

    am = AnnotationManager(project)
    am.project.recordings = pd.read_csv("tests/data/time_range_recordings.csv")

    annotations = pd.read_csv("tests/data/time_range_annotations.csv")

    interval = TimeInterval(
        datetime.datetime(1900, 1, 1, 9, 0),
        datetime.datetime(1900, 1, 1, 20, 0),
    )
    matches = am.get_within_time_range(annotations, interval)

    truth = pd.read_csv("tests/truth/time_range.csv")
    compared = truth.columns
    pd.testing.assert_frame_equal(
        standardize_dataframe(matches, compared),
        standardize_dataframe(truth, compared),
    )

    try:
        am.get_within_time_range(annotations, "9am", "8pm")
    except ValueError:
        exception_caught = True
    else:
        exception_caught = False

    assert exception_caught, "no exception was thrown despite invalid times"
Пример #14
0
def test_metrics_segments(project):
    """Compare Metrics output computed by="segments" with stored truth.

    Fake vocalizations are imported under three sets; the list of metrics
    is a table of (callable, set, speaker) rows and the segments are read
    from tests/data/segments.csv.
    """
    vocalizations = pd.read_csv("tests/data/aclew.csv")

    index_rows = []
    for set_name in ("segments_vtc", "segments_alice", "segments_vcm"):
        index_rows.append({
            "set": set_name,
            "raw_filename": "file.rttm",
            "time_seek": 0,
            "recording_filename": "sound.wav",
            "range_onset": 0,
            "range_offset": 4000,
            "format": "rttm",
        })

    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.DataFrame(index_rows),
        import_function=partial(fake_vocs, vocalizations),
    )

    # One row per requested metric: callable name, annotation set, speaker.
    metric_rows = [
        ["voc_speaker", "segments_vtc", "FEM"],
        ["voc_speaker", "segments_vtc", "CHI"],
        ["voc_speaker_ph", "segments_vtc", "FEM"],
        ["voc_speaker_ph", "segments_vtc", "CHI"],
        ["wc_speaker_ph", "segments_alice", "FEM"],
        ["lp_n", "segments_vcm", pd.NA],
        ["lp_dur", "segments_vcm", pd.NA],
    ]
    metrics_list = pd.DataFrame(
        np.array(metric_rows), columns=["callable", "set", "speaker"])

    extractor = Metrics(
        project,
        metrics_list=metrics_list,
        by="segments",
        rec_cols="date_iso",
        child_cols="experiment,child_dob",
        segments="tests/data/segments.csv",
    )
    extractor.extract()

    expected = pd.read_csv("tests/truth/segments_metrics.csv")
    pd.testing.assert_frame_equal(extractor.metrics, expected, check_like=True)
def test_clipping(project):
    """Check that clip_segments bounds the 'vtc_rttm' segments to 1981-1984.

    After clipping, every onset and offset must lie within the bounds and
    exactly two segments must remain.
    """
    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv'))
    manager.read()

    start, stop = 1981, 1984

    vtc_annotations = manager.annotations[
        manager.annotations['set'] == 'vtc_rttm']
    segments = manager.get_segments(vtc_annotations)
    segments = manager.clip_segments(segments, start, stop)

    onsets_in_range = segments['segment_onset'].between(start, stop).all()
    assert onsets_in_range and segments['segment_offset'].between(
        start, stop).all(), "segments not properly clipped"

    remaining = segments.shape[0]
    assert remaining == 2, "got {} segments, expected 2".format(remaining)
Пример #16
0
def test_clipping(project):
    """Check that clip_segments bounds segments to 1981000-1984000.

    Only the "vtc_rttm" annotations of "sound.wav" are imported. After
    clipping, every onset and offset must lie within the bounds and exactly
    two segments must remain.
    """
    manager = AnnotationManager(project)

    index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    index = index[index["recording_filename"] == "sound.wav"]
    manager.import_annotations(index[index["set"] == "vtc_rttm"])
    manager.read()

    start, stop = 1981000, 1984000

    vtc_annotations = manager.annotations[
        manager.annotations["set"] == "vtc_rttm"]
    segments = manager.get_segments(vtc_annotations)
    segments = manager.clip_segments(segments, start, stop)

    onsets_in_range = segments["segment_onset"].between(start, stop).all()
    assert (
        onsets_in_range
        and segments["segment_offset"].between(start, stop).all()
    ), "segments not properly clipped"

    remaining = segments.shape[0]
    assert remaining == 2, "got {} segments, expected 2".format(remaining)
Пример #17
0
def test_set_from_path(project):
    """Check the set name that set_from_path derives from various paths.

    Covers a plain set folder (with and without trailing slash), a nested
    subset, and the "converted" / "raw" subfolders of a subset.
    """
    manager = AnnotationManager(project)

    # (path relative to the project root, expected set name)
    cases = [
        ("annotations/set", "set"),
        ("annotations/set/", "set"),
        ("annotations/set/subset", "set/subset"),
        ("annotations/set/subset/converted", "set/subset"),
        ("annotations/set/subset/raw", "set/subset"),
    ]

    for relative_path, expected_set in cases:
        full_path = os.path.join(project.path, relative_path)
        assert manager.set_from_path(full_path) == expected_set
Пример #18
0
def test_custom_importation(project):
    """Import one annotation with format "custom" through custom_function.

    After reading the index back, validation must report no errors.
    """
    manager = AnnotationManager(project)

    index_row = {
        "set": "vtc_rttm",
        "range_onset": 0,
        "range_offset": 4000,
        "recording_filename": "sound.wav",
        "time_seek": 0,
        "raw_filename": "example.rttm",
        "format": "custom",
    }
    index = pd.DataFrame([index_row])

    manager.import_annotations(index, import_function=custom_function)
    manager.read()

    validation_errors, _ = manager.validate()
    assert len(validation_errors) == 0
def test_vc_stats(project, turntakingthresh):
    """Compare get_vc_stats output with the truth file for this threshold.

    The statistics are computed on the segments of 'example_metrics.rttm'
    and compared with tests/truth/vc_truth_<threshold>.csv using an absolute
    tolerance of 3.
    """
    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.read_csv('examples/valid_raw_data/raw_annotations/input.csv'))

    selected = manager.annotations[
        manager.annotations['raw_filename'] == 'example_metrics.rttm']

    stats = manager.get_vc_stats(manager.get_segments(selected),
                                 turntakingthresh=turntakingthresh)
    stats = stats.reset_index()

    truth_path = 'tests/truth/vc_truth_{:.1f}.csv'.format(turntakingthresh)
    truth = pd.read_csv(truth_path)

    sort_keys = stats.columns.tolist()

    def normalize(frame):
        # Same column order and row order on both sides of the comparison.
        return frame.reset_index().sort_index(axis=1).sort_values(sort_keys)

    pd.testing.assert_frame_equal(normalize(stats), normalize(truth), atol=3)
def test_intersect(project):
    """Compare both halves of a two-set intersection with stored truth.

    The 'textgrid' and 'vtc_rttm' annotations are intersected; each result
    is compared with its truth file, leaving out the 'imported_at' column.
    """
    manager = AnnotationManager(project)
    manager.import_annotations(
        pd.read_csv('examples/valid_raw_data/raw_annotations/intersect.csv'))
    manager.read()

    index = manager.annotations
    a, b = manager.intersection(index[index['set'] == 'textgrid'],
                                index[index['set'] == 'vtc_rttm'])

    def normalize(frame, sort_keys):
        # Sort columns, then rows, before dropping the import timestamp.
        ordered = frame.sort_index(axis=1).sort_values(sort_keys)
        return ordered.reset_index(drop=True).drop(columns=['imported_at'])

    cases = (
        (a, 'tests/truth/intersect_a.csv'),
        (b, 'tests/truth/intersect_b.csv'),
    )
    for actual, truth_path in cases:
        sort_keys = actual.columns.tolist()
        pd.testing.assert_frame_equal(
            normalize(actual, sort_keys),
            normalize(pd.read_csv(truth_path), sort_keys))
Пример #21
0
    def run(
        self,
        destination: str,
        segments: str,
        eaf_type: str,
        template: str,
        context_onset: int = 0,
        context_offset: int = 0,
        path: str = None,
        import_speech_from: str = None,
        **kwargs,
    ):
        """Generate one .eaf file (plus its .pfsx) per recording to code.

        :param destination: eaf destination
        :type destination: str
        :param segments: path to the input segments dataframe
        :type segments: str
        :param eaf_type: eaf-type [random, periodic]
        :type eaf_type: str
        :param template: name of the template to use (basic, native, or non-native)
        :type template: str
        :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
        :type context_onset: int
        :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
        :type context_offset: int
        :param path: project path
        :type path: str
        :param import_speech_from: annotation set whose segments are passed on to ``create_eaf``; only used when ``path`` is also given
        :type import_speech_from: str
        """

        try:
            from importlib import resources
        except ImportError:
            # TODO: Perhaps add this as a dependency to the resources?
            import importlib_resources as resources

        etf_path = "{}.etf".format(template)
        pfsx_path = "{}.pfsx".format(template)

        # The three built-in templates are looked up inside the package;
        # any other value is used as a path prefix as-is.
        if template in ("basic", "native", "non-native"):
            with resources.path("ChildProject.templates",
                                etf_path) as packaged_etf:
                etf_path = str(packaged_etf)

            with resources.path("ChildProject.templates",
                                pfsx_path) as packaged_pfsx:
                pfsx_path = str(packaged_pfsx)

        for required_file in (etf_path, pfsx_path):
            if not os.path.exists(required_file):
                raise Exception("{} cannot be found".format(required_file))

        print("making the " + eaf_type + " eaf file and csv")

        segments = pd.read_csv(segments)

        assert_dataframe("segments", segments, not_empty=True)
        assert_columns_presence(
            "segments",
            segments,
            {"recording_filename", "segment_onset", "segment_offset"},
        )

        # Pre-filling needs both a project and a set to take speech from.
        imported_set = None
        prefill = path and import_speech_from
        if prefill:
            am = AnnotationManager(ChildProject(path))
            am.read()
            imported_set = import_speech_from

        bound_columns = ["segment_onset", "segment_offset"]
        range_names = {
            "segment_onset": "range_onset",
            "segment_offset": "range_offset",
        }

        for recording_filename, segs in segments.groupby("recording_filename"):
            recording_prefix = os.path.splitext(recording_filename)[0]
            output_filename = "_".join(
                [recording_prefix, eaf_type, os.path.basename(template)])

            # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
            bounds = segs.loc[:, bound_columns].values
            timestamps = [(onset, offset) for onset, offset in bounds]

            speech_segments = None
            imported_format = None
            if prefill:
                ranges = segs.assign(recording_filename=recording_filename)
                ranges = ranges.rename(columns=range_names)
                matches = am.get_within_ranges(ranges, [import_speech_from],
                                               'warn')

                # No annotation overlaps this recording: no file is written.
                if len(matches) == 0:
                    continue

                speech_segments = am.get_segments(matches)
                try:
                    formats = matches["format"].drop_duplicates()
                    # The format is only passed on when it is unambiguous.
                    if len(formats.index) == 1:
                        imported_format = formats.iloc[0]
                except KeyError:
                    imported_format = None

            output_dir = os.path.join(destination, recording_prefix)

            create_eaf(
                etf_path,
                output_filename,
                output_dir,
                recording_filename,
                timestamps,
                eaf_type,
                context_onset,
                context_offset,
                template,
                speech_segments,
                imported_set,
                imported_format,
            )

            pfsx_destination = os.path.join(
                output_dir, "{}.pfsx".format(output_filename))
            shutil.copy(pfsx_path, pfsx_destination)
Пример #22
0
def test_rename(project):
    """Check that rename_set moves every annotation to the new set name.

    After renaming "textgrid" to "renamed", validation must be clean, no
    annotation may remain under the old name, and the new name must hold as
    many annotations as the old one did.
    """
    manager = AnnotationManager(project)

    index = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    manager.import_annotations(index[index["set"] == "textgrid"])
    manager.read()

    def count_in(set_name):
        # Number of indexed annotations currently filed under set_name.
        return manager.annotations[
            manager.annotations["set"] == set_name].shape[0]

    tg_count = count_in("textgrid")

    manager.rename_set("textgrid", "renamed")
    manager.read()

    errors, warnings = manager.validate()
    assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

    assert count_in("textgrid") == 0
    assert count_in("renamed") == tg_count
Пример #23
0
 def _load(self):
     """Open the project at ``self.ds.path`` and read its annotation index."""
     dataset_path = self.ds.path
     self.project = ChildProject(dataset_path)

     manager = AnnotationManager(self.project)
     self.am = manager
     manager.read()
Пример #24
0
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task

import argparse

# Command line: the dataset path, plus the annotation sets to compare.
parser = argparse.ArgumentParser(
    description=
    'compute agreement measures for all given annotators over a whole dataset')
parser.add_argument('path', help='path to the dataset')
parser.add_argument('--sets', nargs='+', help='sets to include')
args = parser.parse_args()

# Speaker types kept for the comparison; other segments are filtered out below.
speakers = ['CHI', 'OCH', 'FEM', 'MAL']

# NOTE(review): ChildProject is not imported in the visible part of this
# script — confirm it is brought into scope.
project = ChildProject(args.path)
am = AnnotationManager(project)
am.read()

# Intersect the annotation index across the requested sets, then load the
# matching segments (presumably the portions covered by every set — verify
# against AnnotationManager.intersection).
intersection = AnnotationManager.intersection(am.annotations, args.sets)
segments = am.get_collapsed_segments(intersection)

segments = segments[segments['speaker_type'].isin(speakers)]

vectors = [
    grid_to_vector(
        segments_to_grid(segments[segments['set'] == s],
                         0,
                         segments['segment_offset'].max(),
                         100,
                         'speaker_type',
                         speakers,
Пример #25
0
    def extract_chunks(self,
                       destination,
                       path,
                       annotation_set='vtc',
                       batch_size=1000,
                       target_speaker_type='CHI',
                       sample_size=500,
                       chunk_length=500,
                       threads=0,
                       batches=0,
                       **kwargs):
        """Split the target speaker's segments into chunks and index them.

        Segments of ``annotation_set`` whose speaker type is
        ``target_speaker_type`` are shifted by their ``time_seek``, grouped
        by recording and handed to ``self.split_recording`` in a process
        pool. The resulting chunks are shuffled, assigned to batches of
        ``batch_size`` and written to ``<destination>/chunks.csv``.

        :param destination: output folder; ``<destination>/chunks`` must be empty
        :param path: path to the project
        :param annotation_set: annotation set to take the segments from
        :param batch_size: number of chunks per batch
        :param target_speaker_type: speaker type whose segments are kept
        :param sample_size: stored as ``self.sample_size``
        :param chunk_length: stored as ``self.chunk_length``; must divide 1000
        :param threads: pool size; values <= 0 use ``mp.cpu_count()``
        :param batches: unused in this method
        :raises AssertionError: if ``chunk_length`` does not divide 1000
        :raises ValueError: if ``<destination>/chunks`` is not empty
        """

        # Cast before validating: the int() casts suggest these may arrive
        # as strings, and "1000 % '500'" would raise a TypeError.
        batch_size = int(batch_size)
        sample_size = int(sample_size)
        chunk_length = int(chunk_length)
        threads = int(threads)

        assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

        self.destination = destination
        self.project = ChildProject(path)

        self.sample_size = sample_size
        self.chunk_length = chunk_length

        am = AnnotationManager(self.project)
        self.annotations = am.annotations
        self.annotations = self.annotations[self.annotations['set'] ==
                                            annotation_set]
        self.segments = am.get_segments(self.annotations)
        # Copy the filtered rows so the column assignments below do not
        # target a slice of the unfiltered frame.
        self.segments = self.segments[self.segments['speaker_type'] ==
                                      target_speaker_type].copy()
        self.segments['segment_onset'] = self.segments[
            'segment_onset'] + self.segments['time_seek']
        self.segments['segment_offset'] = self.segments[
            'segment_offset'] + self.segments['time_seek']

        destination_path = os.path.join(destination, 'chunks')
        os.makedirs(destination_path, exist_ok=True)
        if os.listdir(destination_path):
            raise ValueError(
                "destination '{}' is not empty, please choose another destination."
                .format(destination_path))

        segments = [
            _segments.assign(recording_filename=_recording)
            for _recording, _segments in self.segments.groupby(
                'recording_filename')
        ]

        # The context manager shuts the worker processes down once the
        # (blocking) map has returned, instead of leaking the pool.
        with mp.Pool(threads if threads > 0 else mp.cpu_count()) as pool:
            chunks_per_recording = pool.map(self.split_recording, segments)

        self.chunks = pd.DataFrame([{
            'recording': c.recording,
            'onset': c.onset,
            'offset': c.offset,
            'wav': c.getbasename('wav'),
            'mp3': c.getbasename('mp3'),
            'speaker_type': target_speaker_type,
            'date_extracted':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'uploaded': False,
            'project_slug': '',
            'subject_set': '',
            'zooniverse_id': 0
        } for c in itertools.chain.from_iterable(chunks_per_recording)])

        # shuffle chunks so that they can't be joined back together
        # based on Zooniverse subject IDs
        self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
        self.chunks['batch'] = self.chunks.index.map(
            lambda x: int(x / batch_size))
        self.chunks.index.name = 'index'
        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
Пример #26
0
def test_merge(project):
    """Merge the "vtc_rttm" and "alice" sets into "alice_vtc" in two steps.

    The first merge is filtered to 'sound.wav' and must create a single
    annotation. The second merge, run with skip_existing, must add
    'sound2.wav' with a different 'imported_at' value. The merged segments
    are then compared with both source sets.
    """
    am = AnnotationManager(project)

    input_annotations = pd.read_csv("examples/valid_raw_data/annotations/input.csv")
    wanted = input_annotations["set"].isin(["vtc_rttm", "alice"])
    input_annotations = input_annotations[wanted]
    print(input_annotations)
    am.import_annotations(input_annotations)
    am.read()

    print(am.annotations)
    am.read()

    # Arguments shared by both merge calls.
    merge_options = dict(
        left_set="vtc_rttm",
        right_set="alice",
        left_columns=["speaker_type"],
        right_columns=["phonemes", "syllables", "words"],
        output_set="alice_vtc",
        full_set_merge=False,
    )

    am.merge_sets(recording_filter={'sound.wav'}, **merge_options)
    am.read()

    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 1
    assert anns.iloc[0]['recording_filename'] == 'sound.wav'

    # sleep 2 seconds so that the two merges get different 'imported_at' values
    time.sleep(2)

    am.merge_sets(skip_existing=True, **merge_options)
    am.read()

    anns = am.annotations[am.annotations['set'] == 'alice_vtc']
    assert anns.shape[0] == 2
    assert set(anns['recording_filename'].unique()) == {'sound.wav', 'sound2.wav'}
    assert anns.iloc[0]['imported_at'] != anns.iloc[1]['imported_at']

    def segments_of(set_name):
        # Segments of every annotation indexed under set_name.
        return am.get_segments(am.annotations[am.annotations["set"] == set_name])

    segments = segments_of("alice_vtc")
    vtc_segments = segments_of("vtc_rttm")
    assert segments.shape[0] == vtc_segments.shape[0]
    assert segments.shape[1] == vtc_segments.shape[1] + 3

    sort_keys = ["segment_onset", "segment_offset"]
    merged_columns = ["phonemes", "syllables", "words"]

    adults = segments[segments["speaker_type"].isin(["FEM", "MAL"])]
    adult_segments = adults.sort_values(sort_keys).reset_index(drop=True)
    alice = segments_of("alice").sort_values(sort_keys).reset_index(drop=True)

    pd.testing.assert_frame_equal(
        adult_segments[merged_columns],
        alice[merged_columns],
    )
Пример #27
0
def _eaf_tier_bounds(eaf, tier_name):
    """Collect the time bounds of every aligned annotation of an EAF tier.

    :param eaf: parsed EAF object exposing ``tiers`` and ``timeslots``
    :param tier_name: name of the tier to read
    :return: DataFrame with one row per annotation and the integer columns
        ``segment_onset`` and ``segment_offset``
    """
    # The first element of a tier entry maps annotation ids to
    # (start timeslot, end timeslot, value, svg_ref) tuples.
    aligned = eaf.tiers[tier_name][0]
    return pd.DataFrame([
        {
            'segment_onset': int(eaf.timeslots[start_ts]),
            'segment_offset': int(eaf.timeslots[end_ts]),
        }
        for (start_ts, end_ts, _value, _svg_ref) in aligned.values()
    ])


def _sorted_bounds(segments):
    """Return the onset/offset columns of ``segments`` in a canonical order.

    Rows are sorted by onset then offset and re-indexed from zero so that two
    frames holding the same segments compare equal regardless of row order.
    """
    columns = ['segment_onset', 'segment_offset']
    return segments[columns].sort_values(columns).reset_index(drop=True)


def test_periodic(project):
    """Check the EAF file generated from a periodic sampling of 'sound.wav'.

    The test imports fake VTC annotations, samples the recording periodically,
    builds an EAF file from the sampled segments and then verifies that:

    - the 'code_periodic' tier matches the sampled segments;
    - the 'VTC-SPEECH' tier matches the imported segments without speaker type;
    - the 'VTC-CHI', 'VTC-OCH' and 'VTC-FEM' tiers match the imported segments
      of the corresponding speaker type;
    - the media descriptor points to 'sound.wav'.

    :param project: project fixture passed to the annotation manager and
        to the sampler
    """
    import os  # local import: only needed to prepare the output directory

    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([{
            "set": "vtc",
            "raw_filename": "file.rttm",
            "time_seek": 0,
            "recording_filename": "sound.wav",
            "range_onset": 0,
            "range_offset": 4000,
            "format": "vtc_rttm",
        }]),
        import_function=partial(fake_vocs, data),
    )

    # to_csv() does not create missing parent directories; creating it here is
    # harmless if a fixture already did so.
    os.makedirs('output/eaf', exist_ok=True)

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
    sampler.sample()
    sampler.segments.to_csv('output/eaf/segments.csv')

    # The sampled segments double as the ranges within which the imported
    # annotations are looked up.
    ranges = sampler.segments.rename(columns={
        "segment_onset": "range_onset",
        "segment_offset": "range_offset",
    })
    annotations = am.get_within_ranges(ranges, [IMP_FROM], 'warn')
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination='output/eaf',
        segments='output/eaf/segments.csv',
        eaf_type='periodic',
        template='basic',
        context_onset=250,
        context_offset=250,
        path='output/eaf',
        import_speech_from='vtc',
    )

    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')

    # The coding tier must reproduce the sampled segments exactly.
    pd.testing.assert_frame_equal(
        _sorted_bounds(_eaf_tier_bounds(eaf, 'code_periodic')),
        _sorted_bounds(sampler.segments),
    )

    # Segments without a speaker type are expected in the generic speech tier.
    speech_segs = annot_segments[pd.isnull(annot_segments['speaker_type'])]
    pd.testing.assert_frame_equal(
        _sorted_bounds(_eaf_tier_bounds(eaf, 'VTC-SPEECH')),
        _sorted_bounds(speech_segs),
    )

    # Each of these speaker types has its own 'VTC-<type>' tier.
    for speaker_type in ('CHI', 'OCH', 'FEM'):
        expected = annot_segments[annot_segments['speaker_type'] == speaker_type]
        pd.testing.assert_frame_equal(
            _sorted_bounds(_eaf_tier_bounds(eaf, 'VTC-' + speaker_type)),
            _sorted_bounds(expected),
        )

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
Пример #28
0
class MetadataExtractor(BaseMetadataExtractor):
    """Metadata extractor for datasets following the ChildProject layout.

    Produces dataset-level metadata (experiment, sample sizes, languages,
    devices) and, on request, per-file metadata for the converted annotation
    files that are part of ``self.paths``.
    """

    def _load(self):
        """Instantiate the project at ``self.ds.path`` and read its annotation index."""
        self.project = ChildProject(self.ds.path)
        self.am = AnnotationManager(self.project)
        self.am.read()

    def _get_dsmeta(self, dataset, content):
        """Build the dataset-level metadata dictionary.

        :param dataset: unused
        :param content: unused
        :return: dict with the keys ``experiment``, ``total_children``,
            ``total_recordings``, ``total_duration``, ``languages``,
            ``devices`` and ``@context``
        """
        recordings = self.project.recordings
        children = self.project.children

        ## Extract experiment(s)
        experiment = None
        try:
            experiments = list(recordings['experiment'].unique())
            # Explicit check rather than `assert`, which is stripped under -O
            # and would let an ambiguous experiment through silently.
            if len(experiments) != 1:
                raise ValueError(
                    "expected exactly one experiment, found {}".format(len(experiments))
                )
            experiment = experiments[0]
        except Exception as exc:
            # Best effort: the experiment is reported as None on failure.
            lgr.error("could not determine the experiment ({})".format(str(exc)))

        dsmeta = {
            'experiment': experiment
        }

        ## Extract sample size
        dsmeta['total_children'] = children.shape[0]
        dsmeta['total_recordings'] = recordings.dropna(subset = ['recording_filename']).shape[0]
        dsmeta['total_duration'] = int(recordings['duration'].sum())

        ## Extract languages
        languages = []
        if 'language' in children.columns:
            # Missing values are skipped so that NaN never ends up in the list.
            languages.extend(
                str(language).strip()
                for language in children['language'].dropna()
            )

        if 'languages' in children.columns:
            # Each cell holds a ';'-separated list; flatten and strip every item.
            for entry in children['languages'].dropna():
                languages.extend(item.strip() for item in str(entry).split(';'))

        # De-duplicate, drop empty items and sort for a deterministic output.
        dsmeta['languages'] = sorted({language for language in languages if language})

        ### Extract devices
        dsmeta['devices'] = list(recordings['recording_device_type'].dropna().unique())

        ### Vocabulary specifications
        context = {}
        context['childproject'] = {
            '@id': '#',
            'description': 'ad-hoc vocabulary for the ChildProject standard',
            'type': vocabulary_id,
        }
        context.update(vocabulary)
        dsmeta['@context'] = context

        return dsmeta

    def _get_cnmeta(self, dataset, content):
        """Build the per-file metadata of the converted annotation files.

        Only annotation files whose absolute path matches one of
        ``self.paths`` (resolved against ``self.ds.path``) are described.
        When several index entries point to the same file, the one with the
        latest ``imported_at`` is used.

        :param dataset: unused
        :param content: unused
        :return: list of ``(path, metadata)`` tuples, where ``metadata`` holds
            the keys ``set``, ``format``, ``data``, ``package_version`` and
            ``duration``
        """
        # Explicit columns keep the merge below valid when self.paths is empty.
        contents = pd.DataFrame(
            [
                {'path': f, 'abspath': os.path.abspath(os.path.join(self.ds.path, f))}
                for f in self.paths
            ],
            columns = ['path', 'abspath']
        )

        # Work on a copy so that the annotation manager's index is not altered.
        annotations = self.am.annotations.copy()

        # Built with a comprehension rather than a row-wise apply(), which does
        # not return a Series when the frame is empty.
        annotations['abspath'] = [
            os.path.abspath(os.path.join(
                self.project.path,
                'annotations',
                set_name,
                'converted',
                annotation_filename
            ))
            for set_name, annotation_filename in zip(
                annotations['set'], annotations['annotation_filename']
            )
        ]

        # Keep only the most recently imported entry for each file.
        annotations = (
            annotations
            .sort_values('imported_at')
            .drop_duplicates('abspath', keep = 'last')
        )
        annotations = annotations.merge(contents, how = 'inner', on = 'abspath')

        # Names of the columns that hold at least one value in each file.
        annotations['columns'] = [
            ','.join(pd.read_csv(f).dropna(axis=1, how='all').columns)
            for f in annotations['abspath']
        ]

        return [
            (
                annotation['path'],
                {
                    'set': annotation['set'],
                    'format': annotation['format'],
                    'data': annotation['columns'],
                    'package_version': annotation['package_version'],
                    'duration': annotation['range_offset']-annotation['range_onset']
                }
            )
            for annotation in annotations.to_dict(orient = 'records')
        ]

    def get_metadata(self, dataset, content):
        """Return the dataset-level and per-file metadata.

        :param dataset: forwarded to the private extraction helpers
        :param content: when falsy, the per-file metadata is skipped and an
            empty list is returned in its place
        :return: ``(dsmeta, cnmeta)``; ``({}, [])`` when the project could not
            be loaded
        """
        try:
            self._load()
        except Exception as exc:
            lgr.error("could not read the metadata due to some exception.\n{}".format(str(exc)))
            return {}, []

        dsmeta = self._get_dsmeta(dataset, content)
        cnmeta = self._get_cnmeta(dataset, content) if content else []

        return (dsmeta, cnmeta)
Пример #29
0
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

# Command-line script preparing the index of VTC annotations (one .rttm file
# per recording) to be imported into the 'vtc' set of a project.
parser = argparse.ArgumentParser(
    description='import and convert VTC annotations into the project')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--overwrite",
                    help="remove the existing 'vtc' annotation set first",
                    dest='overwrite',
                    action='store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set('vtc')

# One row per recording. The frame is renamed without `inplace` and copied
# after filtering so the columns added below never write into a slice of
# project.recordings.
annotations = project.recordings[['filename']].rename(
    columns={'filename': 'recording_filename'})
annotations = annotations[annotations['recording_filename'] != 'NA'].copy()
annotations['set'] = 'vtc'
annotations['time_seek'] = 0
annotations['range_onset'] = 0
# NOTE(review): range_offset is left at 0 as in the original script; confirm
# that the importer accepts this for whole-recording annotations.
annotations['range_offset'] = 0
annotations['raw_filename'] = annotations['recording_filename'].apply(
    lambda s: os.path.join('vtc', s + '.rttm'))
annotations['format'] = 'vtc_rttm'
# NOTE(review): the visible part of this script never hands `annotations` to
# am.import_annotations(); confirm whether that call was lost.
Пример #30
0
#!/usr/bin/env python3
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

# Command-line script importing the VTC annotations (one .rttm file per
# recording) of a project into the annotation set given by --set.
parser = argparse.ArgumentParser(description='import and convert VTC annotations into the project')
parser.add_argument("--source", help = "project path", required = True)
parser.add_argument("--set", help = "annotation set. the rttm files should lie in <source>/annotations/<set>/raw/", default = 'vtc')
parser.add_argument("--overwrite", help = "remove the existing annotation set before importing", dest = 'overwrite', action = 'store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set(args.set)

# One row per recording. The filtered frame is copied so that the columns
# added below never write into a slice of project.recordings.
annotations = project.recordings[['recording_filename', 'duration']]
annotations = annotations[annotations['recording_filename'] != 'NA'].copy()
annotations['set'] = args.set
annotations['time_seek'] = 0
annotations['range_onset'] = 0
# Each annotation covers its whole recording.
annotations['range_offset'] = annotations['duration']
# The raw file is named after the recording, with the extension replaced by .rttm.
annotations['raw_filename'] = annotations['recording_filename'].apply(lambda s: os.path.splitext(s)[0] + '.rttm')
annotations['format'] = 'vtc_rttm'

am.import_annotations(annotations, threads = 4)