Example #1
def project(request):
    if not os.path.exists("output/samplers"):
        shutil.copytree(src="examples/valid_raw_data", dst="output/samplers")

    project = ChildProject("output/samplers")
    project.read()
    yield project
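
This is a pytest fixture: pytest injects it into any test that declares a parameter of the same name, and the yield hands the prepared project over for the test's duration. A minimal sketch of a consuming test (the test name and assertion are illustrative, and the @pytest.fixture decorator is assumed to precede the definition above):

def test_project_reads(project):
    # pytest resolves the 'project' argument via the fixture above
    assert project.recordings is not None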
Example #2
def convert(args):
    """convert recordings to a given format"""
    profile = RecordingProfile(name=args.name,
                               format=args.format,
                               codec=args.codec,
                               sampling=args.sampling,
                               split=args.split)

    project = ChildProject(args.source)
    results = project.convert_recordings(profile,
                                         skip_existing=args.skip_existing,
                                         threads=args.threads)

    for error in project.errors:
        print("error: {}".format(error), file=sys.stderr)

    for warning in project.warnings:
        print("warning: {}".format(warning))

    if len(project.errors) > 0:
        print("conversion failed, {} error(s) occured".format(
            len(project.errors)),
              file=sys.stderr)
        print("cannot convert recordings", file=sys.stderr)
        sys.exit(1)

    print("recordings successfully converted to '{}'".format(
        os.path.join(project.path, 'converted_recordings', profile.name)))
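
convert expects an argparse namespace, so it is presumably registered as a CLI subcommand. A hedged sketch of a standalone wrapper around the function above (flag names are inferred from the attributes the function reads and are assumptions, not the package's actual CLI; ChildProject, RecordingProfile, sys and os are assumed imported):

import argparse

parser = argparse.ArgumentParser(description="convert recordings")
parser.add_argument("source", help="project path")
parser.add_argument("--name", required=True)
parser.add_argument("--format", required=True)
parser.add_argument("--codec", required=True)
parser.add_argument("--sampling", required=True)
parser.add_argument("--split", default=None)
parser.add_argument("--skip-existing", dest="skip_existing", action="store_true")
parser.add_argument("--threads", type=int, default=0)
convert(parser.parse_args())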
Example #3
def test_valid_project():
    project = ChildProject("examples/valid_raw_data")
    errors, warnings = project.validate_input_data()

    assert len(errors) == 0, "valid input validation failed (expected to pass)"
    assert len(warnings) == 1, "expected 1 warning, got {}".format(
        len(warnings))
Example #4
def compute_durations(args):
    """creates a 'duration' column into metadata/recordings"""
    project = ChildProject(args.source)

    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occured".format(len(errors)),
              file=sys.stderr)
        sys.exit(1)

    if 'duration' in project.recordings.columns:
        if not args.force:
            print("duration exists, aborting")
            return

        project.recordings.drop(columns=['duration'], inplace=True)

    durations = project.compute_recordings_duration().dropna()

    recordings = project.recordings.merge(
        durations[durations['filename'] != 'NA'],
        how='left',
        left_on='filename',
        right_on='filename')
    recordings.to_csv(os.path.join(project.path, 'metadata/recordings.csv'),
                      index=False)
Example #5
def import_annotations(args):
    """convert and import a set of annotations"""

    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occured".format(len(errors)),
              file=sys.stderr)
        sys.exit(1)

    if args.annotations:
        annotations = pd.read_csv(args.annotations)
    else:
        annotations = pd.DataFrame([{
            col.name: getattr(args, col.name)
            for col in AnnotationManager.INDEX_COLUMNS if not col.generated
        }])

    am = AnnotationManager(project)
    am.import_annotations(annotations)

    errors, warnings = am.validate()

    if len(am.errors) > 0:
        print("importation completed with {} errors and {} warnings".format(
            len(am.errors) + len(errors), len(warnings)),
              file=sys.stderr)
        print("\n".join(am.errors), file=sys.stderr)
        print("\n".join(errors), file=sys.stderr)
        print("\n".join(warnings))
Example #6
    def run(
        self,
        path: str,
        input_set: str,
        output_set: str,
        replacements_json_dict: str = "",
        **kwargs
    ):
        """Anonymize a set of its annotations (`input_set`) and saves it as `output_set`."""

        if input_set == output_set:
            raise Exception("input_set and output_set should not be equal")

        project = ChildProject(path)
        project.read()

        replacements = self.DEFAULT_REPLACEMENTS
        if replacements_json_dict:
            with open(replacements_json_dict, "r") as f:
                replacements = json.load(f)

        input_set_path = os.path.join(project.path, "annotations", input_set, "raw")
        output_set_path = os.path.join(project.path, "annotations", output_set, "raw")

        if os.path.exists(output_set_path):
            raise Exception("destination {} already exists".format(output_set_path))

        its_files = glob.glob(os.path.join(input_set_path, "**/*.*"), recursive=True)
        for its in its_files:
            inFile = its
            outFile = os.path.join(
                output_set_path, its[len(os.path.join(input_set_path, "")) :]
            )
            os.makedirs(os.path.dirname(outFile), exist_ok=True)

            with open(inFile, "r") as inF:
                with open(outFile, "w") as outF:
                    for line in inF:
                        for node in replacements.keys():
                            if re.search(
                                r"<{}\b".format(node), line
                            ):  # word boundary is important here
                                for name, value in replacements[node].items():
                                    if isinstance(value, list):
                                        if bool(value[1]["only_time"]) is True:
                                            line = re.sub(
                                                r'{}="[0-9\-]*'.format(name),
                                                r'{}="{}'.format(
                                                    name, value[0]["replace_value"]
                                                ),
                                                line,
                                            )
                                            continue

                                    line = re.sub(
                                        r'{}="[a-zA-Z0-9_.:\-]*"'.format(name),
                                        r'{}="{}"'.format(name, value),
                                        line,
                                    )
                        outF.write(line)
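
The core of the anonymization is a pair of regular-expression substitutions on attribute values. The behaviour can be checked in isolation; the node and attribute names below are made up for illustration:

import re

replacements = {"Recording": {"startClockTime": "1000-01-01T00:00:00Z"}}
line = '<Recording num="1" startClockTime="2021-05-06T09:12:00Z">'
for node, attrs in replacements.items():
    if re.search(r"<{}\b".format(node), line):  # same word-boundary check as above
        for name, value in attrs.items():
            line = re.sub(r'{}="[a-zA-Z0-9_.:\-]*"'.format(name),
                          r'{}="{}"'.format(name, value), line)
print(line)  # <Recording num="1" startClockTime="1000-01-01T00:00:00Z">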
Example #7
def test_read():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    doc = project.read_documentation()
    truth = pd.read_csv("tests/truth/docs.csv")

    pd.testing.assert_frame_equal(
        standardize_dataframe(doc, columns=truth.columns),
        standardize_dataframe(truth, columns=truth.columns),
    )
Example #8
def stats(args):
    project = ChildProject(args.source)

    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occured".format(len(errors)),
              file=sys.stderr)
        sys.exit(1)

    stats = project.get_stats()
    args.stats = args.stats.split(',') if args.stats else []

    for stat in stats:
        if not args.stats or stat in args.stats:
            print("{}: {}".format(stat, stats[stat]))
Example #9
def validate(args):
    """validate the consistency of the dataset returning detailed errors and warnings"""

    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data(args.ignore_files)

    for error in errors:
        print("error: {}".format(error), file=sys.stderr)

    for warning in warnings:
        print("warning: {}".format(warning))

    if len(errors) > 0:
        print("validation failed, {} error(s) occured".format(len(errors)),
              file=sys.stderr)
        sys.exit(1)
Example #10
def test_series_to_datetime():
    project = ChildProject("examples/valid_raw_data")

    only_time = pd.Series(['3:12', '04:14', '12:19:21', '32:77'])
    only_date = pd.Series(
        ['2022-01-23', '2022-01-23', '2022-01-23', '2022-01-23'])

    truth = pd.Series([
        datetime.datetime(1900, 1, 1, 3, 12, 0, 0),
        datetime.datetime(1900, 1, 1, 4, 14, 0, 0),
        datetime.datetime(1900, 1, 1, 12, 19, 21, 0),
        pd.NaT,
    ])

    # convert to datetime using the formats defined for 'start_time'
    # in project.RECORDINGS_COLUMNS
    converted_time = series_to_datetime(only_time, project.RECORDINGS_COLUMNS,
                                        'start_time')

    pd.testing.assert_series_equal(converted_time, truth)

    truth = pd.Series([
        datetime.datetime(2022, 1, 23, 3, 12, 0, 0),
        datetime.datetime(2022, 1, 23, 4, 14, 0, 0),
        datetime.datetime(2022, 1, 23, 12, 19, 21, 0),
        pd.NaT,
    ])

    # convert to datetime using the formats defined for 'start_time'
    # and 'date_iso' in project.RECORDINGS_COLUMNS
    converted_time = series_to_datetime(only_time, project.RECORDINGS_COLUMNS,
                                        'start_time', only_date,
                                        project.RECORDINGS_COLUMNS, 'date_iso')

    pd.testing.assert_series_equal(converted_time, truth)
Example #11
def test_import():
    project = ChildProject("examples/valid_raw_data")
    project.import_data("output/project")

    assert os.path.exists("output/project"), "project folder was not created"

    assert all([
        os.path.exists(os.path.join("output/project", f))
        for f in ['scripts', 'doc']
    ]), "not all folders were successfully created"

    assert (all([
        open(f, "r+").read() == open(
            os.path.join("output/project/",
                         f.replace("examples/valid_raw_data/", ""))).read()
        for f in glob.glob("examples/valid_raw_data/**.*")
    ])), "not all files were successfully copied"
Example #12
def test_whitelist():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(["sound.wav"]))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(pd.Series(["sound.wav"])))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(
            pd.DataFrame({"recording_filename": ["sound.wav"]})))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    pd.DataFrame({
        "recording_filename": ["sound.wav"]
    }).to_csv("output/filter.csv")

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list("output/filter.csv"))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    pd.DataFrame({
        "filename": ["sound.wav"]
    }).to_csv("output/filter.csv")

    caught_value_error = False
    try:
        recordings = project.get_recordings_from_list(
            Pipeline.recordings_from_list("output/filter.csv"))
    except ValueError:
        caught_value_error = True

    assert caught_value_error

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list([
            "examples/valid_raw_data/recordings/raw/sound.wav",
            "examples/valid_raw_data/recordings/raw/sound2.wav",
        ]))
    assert recordings["recording_filename"].tolist() == [
        "sound.wav", "sound2.wav"
    ]
Example #13
def test_invalid_project():
    project = ChildProject("examples/invalid_raw_data")
    errors, warnings = project.validate()

    expected_errors = [
        "examples/invalid_raw_data/metadata/children.csv: child_id '1' appears 2 times in lines [2,3], should appear once",
        "cannot find recording 'test_1_20200918.mp3' at 'examples/invalid_raw_data/recordings/raw/test_1_20200918.mp3'",
        "examples/invalid_raw_data/metadata/recordings.csv: 'USB' is not a permitted value for column 'recording_device_type' on line 2, should be any of [lena,usb,olympus,babylogger,unknown]",
    ]

    expected_warnings = [
        "examples/invalid_raw_data/metadata/recordings.csv: '2' does not pass callable test for column 'noisy_setting' on line 2",
        "file 'examples/invalid_raw_data/recordings/raw/test_1_2020091.mp3' not indexed.",
    ]
    assert sorted(expected_errors) == sorted(errors), \
        "errors do not match expected errors"
    assert sorted(expected_warnings) == sorted(warnings), \
        "warnings do not match expected warnings"
Example #14
def project(request):
    if not os.path.exists("output/annotations"):
        shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")

    project = ChildProject("output/annotations")
    yield project

    os.remove("output/annotations/metadata/annotations.csv")
    for raw_annotation in glob.glob("output/annotations/annotations/*.*/converted"):
        shutil.rmtree(raw_annotation)
Example #15
def test_enforce_dtypes():
    project = ChildProject("examples/valid_raw_data", enforce_dtypes=True)
    project.read()

    assert project.recordings["child_id"].dtype.kind == "O"
    assert project.children["child_id"].dtype.kind == "O"

    project = ChildProject("examples/valid_raw_data", enforce_dtypes=False)
    project.read()

    assert project.recordings["child_id"].dtype.kind == "i"
    assert project.children["child_id"].dtype.kind == "i"
Example #16
def test_compute_ages():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    project.recordings["age"] = project.compute_ages()

    truth = pd.read_csv("tests/truth/ages.csv").set_index("line")

    pd.testing.assert_frame_equal(project.recordings[["child_id", "age"]],
                                  truth[["child_id", "age"]])
Example #17
def project(request):
    if not os.path.exists("output/annotations"):
        project = ChildProject("examples/valid_raw_data")
        project.import_data("output/annotations")

    project = ChildProject("output/annotations")
    yield project

    os.remove("output/annotations/metadata/annotations.csv")
    shutil.rmtree("output/annotations/annotations")
    os.mkdir("output/annotations/annotations")
Example #18
def test_convert():
    project = ChildProject("examples/valid_raw_data")
    project.import_data("output/convert")
    project = ChildProject("output/convert")
    profile = project.convert_recordings(RecordingProfile(name='test'))

    recordings = project.recordings
    converted_recordings = profile.recordings

    assert np.isclose(
        4, project.compute_recordings_duration()['duration'].sum()
    ), "audio duration does not match the expected value"
    assert os.path.exists(
        "output/convert/converted_recordings/test"
    ), "missing converted recordings folder"
    assert recordings.shape[0] == converted_recordings.shape[0], \
        "conversion table is incomplete"
    assert all(converted_recordings['success'].tolist()), \
        "not all recordings were successfully converted"
    assert all([
        os.path.exists(
            os.path.join("output/convert/converted_recordings/test", f))
        for f in converted_recordings['converted_filename'].tolist()
    ]), "recording files are missing"
Example #19
import argparse
import subprocess
import os
import sys
import wave
import time
import datetime

# ChildProject/Pipeline imports added for completeness; the Pipeline module
# path is assumed from the package layout, not confirmed by this snippet
from ChildProject.projects import ChildProject
from ChildProject.pipelines.pipeline import Pipeline

parser = argparse.ArgumentParser(description='')
parser.add_argument("--source", help="path to project", required=True)
parser.add_argument("--profile", help="audio profile to be used", default="", required=False)
parser.add_argument("--mem", help="slurm jobs memory in GB", default=30, type=int)
parser.add_argument("--batch", default=8, type=int)
parser.add_argument("--overwrite", help="overwrite rttm if exists", default=False, required=False)
parser.add_argument("--recordings", help="recordings whitelist", default=[], nargs='+')
args = parser.parse_args()

project = ChildProject(args.source)
errors, warnings = project.validate()

if len(errors) > 0:
    print("validation failed, {} error(s) occured".format(len(errors)), file = sys.stderr)
    print("proceeding despite errors...")

audio_prefix = os.path.join('recordings/converted', args.profile) if args.profile else 'recordings/raw'

recordings = project.recordings

if args.recordings:
    recordings = project.get_recordings_from_list(Pipeline.recordings_from_list(args.recordings))

print('selected recordings:')
print(recordings)
Example #20
parser = argparse.ArgumentParser(
    description='split the top 10 minutes of each recording with the highest '
                'volubility into 500 ms chunks and upload them to zooniverse')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--chunks-destination",
                    help="chunks destination",
                    required=True)
parser.add_argument("--set", help="annotation set", default='its')
parser.add_argument(
    "--project-id",
    help="id of the zooniverse project to upload the chunks to",
    default='')
args = parser.parse_args()

project = ChildProject(args.source)
project.read()

sampler = HighVolubilitySampler(project,
                                annotation_set=args.set,
                                metric='cvc',
                                windows_length=60 * 1000,
                                windows_count=10)
sampler.sample()
sampler.segments.to_csv('segments.csv')

zooniverse = ZooniversePipeline()

chunks_path = zooniverse.extract_chunks(path=project.path,
                                        destination=args.chunks_destination,
                                        keyword='example')
Example #21
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(
    description='import and convert VTC annotations into the project')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--overwrite",
                    help="project path",
                    dest='overwrite',
                    action='store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set('vtc')

input = project.recordings[['filename']]
input.rename(columns={'filename': 'recording_filename'}, inplace=True)
input = input[input['recording_filename'] != 'NA']
input['set'] = 'vtc'
input['time_seek'] = 0
input['range_onset'] = 0
input['range_offset'] = 0
input['raw_filename'] = input['recording_filename'].apply(
    lambda s: os.path.join('vtc', s + '.rttm'))
input['format'] = 'vtc_rttm'
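
The snippet ends before the import itself; presumably it concludes by handing the index to the manager, along the lines of:

am.import_annotations(input)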
Example #22
    def _load(self):
        self.project = ChildProject(self.ds.path)
        self.am = AnnotationManager(self.project)
        self.am.read()
Example #23
    def run(
        self,
        destination: str,
        segments: str,
        eaf_type: str,
        template: str,
        context_onset: int = 0,
        context_offset: int = 0,
        path: str = None,
        import_speech_from: str = None,
        **kwargs,
    ):
        """generate .eaf templates based on intervals to code.

        :param path: project path
        :type path: str
        :param destination: eaf destination
        :type destination: str
        :param segments: path to the input segments dataframe
        :type segments: str
        :param eaf_type: eaf-type [random, periodic]
        :type eaf_type: str
        :param template: name of the template to use (basic, native, or non-native)
        :type template: str
        :param context_onset: context onset and segment onset difference in milliseconds, 0 for no introductory context
        :type context_onset: int
        :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
        :type context_offset: int
        """

        try:
            from importlib import resources
        except ImportError:
            # TODO: Perhaps add this as a dependency to the resources?
            import importlib_resources as resources

        etf_path = "{}.etf".format(template)
        pfsx_path = "{}.pfsx".format(template)

        if template in ["basic", "native", "non-native"]:
            with resources.path("ChildProject.templates", etf_path) as etf:
                etf_path = str(etf)

            with resources.path("ChildProject.templates", pfsx_path) as pfsx:
                pfsx_path = str(pfsx)

        if not os.path.exists(etf_path):
            raise Exception("{} cannot be found".format(etf_path))

        if not os.path.exists(pfsx_path):
            raise Exception("{} cannot be found".format(pfsx_path))

        print("making the " + eaf_type + " eaf file and csv")

        segments = pd.read_csv(segments)

        assert_dataframe("segments", segments, not_empty=True)
        assert_columns_presence(
            "segments",
            segments,
            {"recording_filename", "segment_onset", "segment_offset"},
        )

        imported_set = None
        prefill = path and import_speech_from
        if prefill:
            project = ChildProject(path)
            am = AnnotationManager(project)
            am.read()
            imported_set = import_speech_from

        for recording_filename, segs in segments.groupby("recording_filename"):
            recording_prefix = os.path.splitext(recording_filename)[0]
            output_filename = (recording_prefix + "_" + eaf_type + "_" +
                               os.path.basename(template))

            # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
            timestamps = [(on, off) for on, off in
                          segs.loc[:,
                                   ["segment_onset", "segment_offset"]].values]

            speech_segments = None
            imported_format = None
            if prefill:
                ranges = segs.assign(
                    recording_filename=recording_filename).rename(
                        columns={
                            "segment_onset": "range_onset",
                            "segment_offset": "range_offset",
                        })
                matches = am.get_within_ranges(ranges, [import_speech_from],
                                               'warn')

                if len(matches) == 0:
                    continue

                speech_segments = am.get_segments(matches)
                try:
                    matches = matches["format"].drop_duplicates()
                    if len(matches.index) == 1:
                        imported_format = matches.iloc[0]
                except KeyError:
                    imported_format = None

            output_dir = os.path.join(destination, recording_prefix)

            create_eaf(
                etf_path,
                output_filename,
                output_dir,
                recording_filename,
                timestamps,
                eaf_type,
                context_onset,
                context_offset,
                template,
                speech_segments,
                imported_set,
                imported_format,
            )

            shutil.copy(
                pfsx_path,
                os.path.join(output_dir, "{}.pfsx".format(output_filename)))
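
A hedged invocation sketch, assuming the method belongs to ChildProject's EAF-builder pipeline class (the class and module names below are assumptions, not verified against the package):

# hypothetical: EafBuilderPipeline class/module names assumed
from ChildProject.pipelines.eafbuilder import EafBuilderPipeline

EafBuilderPipeline().run(destination="output/eaf",
                         segments="segments.csv",
                         eaf_type="random",
                         template="basic")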
Example #24
    def extract_chunks(self,
                       destination,
                       path,
                       annotation_set='vtc',
                       batch_size=1000,
                       target_speaker_type='CHI',
                       sample_size=500,
                       chunk_length=500,
                       threads=0,
                       batches=0,
                       **kwargs):

        assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

        self.destination = destination
        self.project = ChildProject(path)

        batch_size = int(batch_size)
        sample_size = int(sample_size)
        chunk_length = int(chunk_length)
        threads = int(threads)

        self.sample_size = sample_size
        self.chunk_length = chunk_length

        am = AnnotationManager(self.project)
        self.annotations = am.annotations
        self.annotations = self.annotations[
            self.annotations['set'] == annotation_set]
        self.segments = am.get_segments(self.annotations)
        self.segments = self.segments[
            self.segments['speaker_type'] == target_speaker_type]
        self.segments['segment_onset'] += self.segments['time_seek']
        self.segments['segment_offset'] += self.segments['time_seek']

        destination_path = os.path.join(destination, 'chunks')
        os.makedirs(destination_path, exist_ok=True)
        if os.listdir(destination_path):
            raise ValueError(
                "destination '{}' is not empty, please choose another destination."
                .format(destination_path))

        segments = []
        for _recording, _segments in self.segments.groupby(
                'recording_filename'):
            segments.append(_segments.assign(recording_filename=_recording))

        pool = mp.Pool(threads if threads > 0 else mp.cpu_count())
        self.chunks = pool.map(self.split_recording, segments)
        self.chunks = itertools.chain.from_iterable(self.chunks)
        self.chunks = pd.DataFrame([{
            'recording': c.recording,
            'onset': c.onset,
            'offset': c.offset,
            'wav': c.getbasename('wav'),
            'mp3': c.getbasename('mp3'),
            'speaker_type': target_speaker_type,
            'date_extracted': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'uploaded': False,
            'project_slug': '',
            'subject_set': '',
            'zooniverse_id': 0
        } for c in self.chunks])

        # shuffle chunks so that they can't be joined back together
        # based on Zooniverse subject IDs
        self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
        self.chunks['batch'] = self.chunks.index.map(
            lambda x: int(x / batch_size))
        self.chunks.index.name = 'index'
        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
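
The final shuffle-and-batch step is plain integer bucketing over the shuffled index; a standalone illustration:

import pandas as pd

chunks = pd.DataFrame({"recording": list("abcdefgh")})
chunks = chunks.sample(frac=1).reset_index(drop=True)  # shuffle
batch_size = 3
chunks["batch"] = chunks.index.map(lambda x: int(x / batch_size))
# rows 0-2 fall in batch 0, rows 3-5 in batch 1, rows 6-7 in batch 2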
Example #25
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task

import argparse

parser = argparse.ArgumentParser(
    description='compute agreement measures for all given annotators over a whole dataset')
parser.add_argument('path', help='path to the dataset')
parser.add_argument('--sets', nargs='+', help='sets to include')
args = parser.parse_args()

speakers = ['CHI', 'OCH', 'FEM', 'MAL']

project = ChildProject(args.path)
am = AnnotationManager(project)
am.read()

intersection = AnnotationManager.intersection(am.annotations, args.sets)
segments = am.get_collapsed_segments(intersection)

segments = segments[segments['speaker_type'].isin(speakers)]

vectors = [
    grid_to_vector(
        segments_to_grid(segments[segments['set'] == s],
                         0,
                         segments['segment_offset'].max(),
                         100,
                         'speaker_type',