def project(request):
    if not os.path.exists("output/samplers"):
        shutil.copytree(src="examples/valid_raw_data", dst="output/samplers")

    project = ChildProject("output/samplers")
    project.read()
    yield project
def convert(args):
    """convert recordings to a given format"""
    profile = RecordingProfile(name=args.name,
                               format=args.format,
                               codec=args.codec,
                               sampling=args.sampling,
                               split=args.split)

    project = ChildProject(args.source)
    results = project.convert_recordings(profile,
                                         skip_existing=args.skip_existing,
                                         threads=args.threads)

    for error in project.errors:
        print("error: {}".format(error), file=sys.stderr)

    for warning in project.warnings:
        print("warning: {}".format(warning))

    if len(project.errors) > 0:
        print("conversion failed, {} error(s) occurred".format(len(project.errors)), file=sys.stderr)
        print("cannot convert recordings", file=sys.stderr)
        sys.exit(1)

    print("recordings successfully converted to '{}'".format(
        os.path.join(project.path, 'converted_recordings', profile.name)))
def test_valid_project():
    project = ChildProject("examples/valid_raw_data")
    errors, warnings = project.validate_input_data()

    assert len(errors) == 0, "valid input validation failed (expected to pass)"
    assert len(warnings) == 1, "expected 1 warning, got {}".format(len(warnings))
def compute_durations(args):
    """add a 'duration' column to metadata/recordings"""
    project = ChildProject(args.source)

    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occurred".format(len(errors)), file=sys.stderr)
        sys.exit(1)

    if 'duration' in project.recordings.columns:
        if not args.force:
            print("duration exists, aborting")
            return

        project.recordings.drop(columns=['duration'], inplace=True)

    durations = project.compute_recordings_duration().dropna()

    recordings = project.recordings.merge(durations[durations['filename'] != 'NA'],
                                          how='left',
                                          left_on='filename',
                                          right_on='filename')
    recordings.to_csv(os.path.join(project.path, 'metadata/recordings.csv'), index=False)
def import_annotations(args):
    """convert and import a set of annotations"""
    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occurred".format(len(errors)), file=sys.stderr)
        sys.exit(1)

    if args.annotations:
        annotations = pd.read_csv(args.annotations)
    else:
        annotations = pd.DataFrame([{
            col.name: getattr(args, col.name)
            for col in AnnotationManager.INDEX_COLUMNS
            if not col.generated
        }])

    am = AnnotationManager(project)
    am.import_annotations(annotations)

    errors, warnings = am.validate()

    if len(am.errors) > 0:
        print("importation completed with {} errors and {} warnings".format(
            len(am.errors) + len(errors), len(warnings)), file=sys.stderr)
        print("\n".join(am.errors), file=sys.stderr)
        print("\n".join(errors), file=sys.stderr)
        print("\n".join(warnings))
def run(
    self,
    path: str,
    input_set: str,
    output_set: str,
    replacements_json_dict: str = "",
    **kwargs
):
    """Anonymize a set of .its annotations (`input_set`) and save it as `output_set`."""
    if input_set == output_set:
        raise Exception("input_set and output_set should not be equal")

    project = ChildProject(path)
    project.read()

    replacements = self.DEFAULT_REPLACEMENTS
    if replacements_json_dict:
        replacements = json.load(open(replacements_json_dict, "r"))

    input_set_path = os.path.join(project.path, "annotations", input_set, "raw")
    output_set_path = os.path.join(project.path, "annotations", output_set, "raw")

    if os.path.exists(output_set_path):
        raise Exception("destination {} already exists".format(output_set_path))

    its_files = glob.glob(os.path.join(input_set_path, "**/*.*"), recursive=True)

    for its in its_files:
        inFile = its
        outFile = os.path.join(
            output_set_path, its[len(os.path.join(input_set_path, "")):]
        )
        os.makedirs(os.path.dirname(outFile), exist_ok=True)

        with open(inFile, "r") as inF:
            with open(outFile, "w") as outF:
                for line in inF:
                    for node in replacements.keys():
                        # word boundary is important here
                        if re.search(r"<{}\b".format(node), line):
                            for name, value in replacements[node].items():
                                if isinstance(value, list):
                                    if bool(value[1]["only_time"]) is True:
                                        line = re.sub(
                                            r'{}="[0-9\-]*'.format(name),
                                            r'{}="{}'.format(name, value[0]["replace_value"]),
                                            line,
                                        )
                                    continue
                                line = re.sub(
                                    r'{}="[a-zA-Z0-9_.:\-]*"'.format(name),
                                    r'{}="{}"'.format(name, value),
                                    line,
                                )
                    outF.write(line)
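# A minimal sketch of the replacements mapping that run() above consumes (whether
# it comes from DEFAULT_REPLACEMENTS or from the JSON file passed as
# replacements_json_dict), inferred from the substitution logic: XML element name
# -> attribute name -> replacement. The element and attribute names used here
# ("Recording", "deviceSN", "startClockTime") are hypothetical examples, not
# values taken from DEFAULT_REPLACEMENTS.
example_replacements = {
    "Recording": {
        # a plain string overwrites the whole attribute value
        "deviceSN": "xxxxxx",
        # a two-element list with "only_time" rewrites only the leading
        # date-like part ([0-9-]*) of the value, preserving the time of day
        "startClockTime": [
            {"replace_value": "1000-01-01"},
            {"only_time": "true"},
        ],
    },
}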
def test_read():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    doc = project.read_documentation()
    truth = pd.read_csv("tests/truth/docs.csv")

    pd.testing.assert_frame_equal(
        standardize_dataframe(doc, columns=truth.columns),
        standardize_dataframe(truth, columns=truth.columns),
    )
def stats(args):
    project = ChildProject(args.source)

    errors, warnings = project.validate_input_data()

    if len(errors) > 0:
        print("validation failed, {} error(s) occurred".format(len(errors)), file=sys.stderr)
        sys.exit(1)

    stats = project.get_stats()
    args.stats = args.stats.split(',') if args.stats else []

    for stat in stats:
        if not args.stats or stat in args.stats:
            print("{}: {}".format(stat, stats[stat]))
def validate(args):
    """validate the consistency of the dataset, returning detailed errors and warnings"""
    project = ChildProject(args.source)
    errors, warnings = project.validate_input_data(args.ignore_files)

    for error in errors:
        print("error: {}".format(error), file=sys.stderr)

    for warning in warnings:
        print("warning: {}".format(warning))

    if len(errors) > 0:
        print("validation failed, {} error(s) occurred".format(len(errors)), file=sys.stderr)
        sys.exit(1)
def test_series_to_datetime():
    project = ChildProject("examples/valid_raw_data")

    only_time = pd.Series(['3:12', '04:14', '12:19:21', '32:77'])
    only_date = pd.Series(['2022-01-23', '2022-01-23', '2022-01-23', '2022-01-23'])

    truth = pd.Series([
        datetime.datetime(1900, 1, 1, 3, 12, 0, 0),
        datetime.datetime(1900, 1, 1, 4, 14, 0, 0),
        datetime.datetime(1900, 1, 1, 12, 19, 21, 0),
        pd.NaT,
    ])

    # convert to datetime using the formats for start_time listed in the project's
    # RECORDINGS_COLUMNS, Index(name == 'start_time')
    converted_time = series_to_datetime(only_time, project.RECORDINGS_COLUMNS, 'start_time')
    pd.testing.assert_series_equal(converted_time, truth)

    truth = pd.Series([
        datetime.datetime(2022, 1, 23, 3, 12, 0, 0),
        datetime.datetime(2022, 1, 23, 4, 14, 0, 0),
        datetime.datetime(2022, 1, 23, 12, 19, 21, 0),
        pd.NaT,
    ])

    # convert to datetime using the formats for start_time and date_iso listed in the
    # project's RECORDINGS_COLUMNS, Index(name == 'start_time') and Index(name == 'date_iso')
    converted_time = series_to_datetime(only_time, project.RECORDINGS_COLUMNS, 'start_time',
                                        only_date, project.RECORDINGS_COLUMNS, 'date_iso')
    pd.testing.assert_series_equal(converted_time, truth)
def test_import():
    project = ChildProject("examples/valid_raw_data")
    project.import_data("output/project")

    assert os.path.exists("output/project"), "project folder was not created"

    assert all([
        os.path.exists(os.path.join("output/project", f))
        for f in ['scripts', 'doc']
    ]), "not all folders were successfully created"

    assert all([
        open(f, "r+").read() == open(
            os.path.join("output/project/", f.replace("examples/valid_raw_data/", ""))).read()
        for f in glob.glob("examples/valid_raw_data/**.*")
    ]), "not all files were successfully copied"
def test_whitelist():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(["sound.wav"]))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(pd.Series(["sound.wav"])))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list(
            pd.DataFrame({"recording_filename": ["sound.wav"]})))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = pd.DataFrame({
        "recording_filename": ["sound.wav"]
    }).to_csv("output/filter.csv")
    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list("output/filter.csv"))
    assert recordings["recording_filename"].tolist() == ["sound.wav"]

    recordings = pd.DataFrame({
        "filename": ["sound.wav"]
    }).to_csv("output/filter.csv")

    caught_value_error = False
    try:
        recordings = project.get_recordings_from_list(
            Pipeline.recordings_from_list("output/filter.csv"))
    except ValueError:
        caught_value_error = True

    assert caught_value_error

    recordings = project.get_recordings_from_list(
        Pipeline.recordings_from_list([
            "examples/valid_raw_data/recordings/raw/sound.wav",
            "examples/valid_raw_data/recordings/raw/sound2.wav",
        ]))
    assert recordings["recording_filename"].tolist() == ["sound.wav", "sound2.wav"]
def test_invalid_project():
    project = ChildProject("examples/invalid_raw_data")

    errors, warnings = project.validate()

    expected_errors = [
        "examples/invalid_raw_data/metadata/children.csv: child_id '1' appears 2 times in lines [2,3], should appear once",
        "cannot find recording 'test_1_20200918.mp3' at 'examples/invalid_raw_data/recordings/raw/test_1_20200918.mp3'",
        "examples/invalid_raw_data/metadata/recordings.csv: 'USB' is not a permitted value for column 'recording_device_type' on line 2, should be any of [lena,usb,olympus,babylogger,unknown]",
    ]

    expected_warnings = [
        "examples/invalid_raw_data/metadata/recordings.csv: '2' does not pass callable test for column 'noisy_setting' on line 2",
        "file 'examples/invalid_raw_data/recordings/raw/test_1_2020091.mp3' not indexed.",
    ]

    assert sorted(expected_errors) == sorted(errors), "errors do not match expected errors"
    assert sorted(expected_warnings) == sorted(warnings), "warnings do not match expected warnings"
def project(request):
    if not os.path.exists("output/annotations"):
        shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")

    project = ChildProject("output/annotations")
    yield project

    os.remove("output/annotations/metadata/annotations.csv")
    for raw_annotation in glob.glob("output/annotations/annotations/*.*/converted"):
        shutil.rmtree(raw_annotation)
def test_enforce_dtypes():
    project = ChildProject("examples/valid_raw_data", enforce_dtypes=True)
    project.read()

    assert project.recordings["child_id"].dtype.kind == "O"
    assert project.children["child_id"].dtype.kind == "O"

    project = ChildProject("examples/valid_raw_data", enforce_dtypes=False)
    project.read()

    assert project.recordings["child_id"].dtype.kind == "i"
    assert project.children["child_id"].dtype.kind == "i"
def test_compute_ages():
    project = ChildProject("examples/valid_raw_data")
    project.read()

    project.recordings["age"] = project.compute_ages()

    truth = pd.read_csv("tests/truth/ages.csv").set_index("line")

    pd.testing.assert_frame_equal(project.recordings[["child_id", "age"]],
                                  truth[["child_id", "age"]])
def project(request):
    if not os.path.exists("output/annotations"):
        project = ChildProject("examples/valid_raw_data")
        project.import_data("output/annotations")

    project = ChildProject("output/annotations")
    yield project

    os.remove("output/annotations/metadata/annotations.csv")
    shutil.rmtree("output/annotations/annotations")
    os.mkdir("output/annotations/annotations")
def test_convert():
    project = ChildProject("examples/valid_raw_data")
    project.import_data("output/convert")

    project = ChildProject("output/convert")
    profile = project.convert_recordings(RecordingProfile(name='test'))

    recordings = project.recordings
    converted_recordings = profile.recordings

    assert np.isclose(
        4, project.compute_recordings_duration()['duration'].sum()
    ), "audio duration does not match expected value"
    assert os.path.exists("output/convert/converted_recordings/test"), "missing converted recordings folder"
    assert recordings.shape[0] == converted_recordings.shape[0], "conversion table is incomplete"
    assert all(converted_recordings['success'].tolist()), "not all recordings were successfully converted"
    assert all([
        os.path.exists(os.path.join("output/convert/converted_recordings/test", f))
        for f in converted_recordings['converted_filename'].tolist()
    ]), "recording files are missing"
import argparse
import subprocess
import os
import sys
import wave
import time
import datetime

from ChildProject.projects import ChildProject
from ChildProject.pipelines.pipeline import Pipeline  # assumed import path for Pipeline

parser = argparse.ArgumentParser(description='')
parser.add_argument("--source", help="path to project", required=True)
parser.add_argument("--profile", help="audio profile to be used", default="", required=False)
parser.add_argument("--mem", help="slurm jobs memory in GB", default=30, type=int)
parser.add_argument("--batch", default=8, type=int)
parser.add_argument("--overwrite", help="overwrite rttm if exists", default=False, required=False)
parser.add_argument("--recordings", help="recordings whitelist", default=[], nargs='+')
args = parser.parse_args()

project = ChildProject(args.source)
errors, warnings = project.validate()

if len(errors) > 0:
    print("validation failed, {} error(s) occurred".format(len(errors)), file=sys.stderr)
    print("proceeding despite errors...")

audio_prefix = os.path.join('recordings/converted', args.profile) if args.profile else 'recordings/raw'

recordings = project.recordings
if args.recordings:
    recordings = project.get_recordings_from_list(Pipeline.recordings_from_list(args.recordings))

print('selected recordings:')
print(recordings)
parser = argparse.ArgumentParser(
    description='split the top 10 minutes of each recording with the highest volubility into 500 ms chunks and upload them to zooniverse')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--chunks-destination", help="chunks destination", required=True)
parser.add_argument("--set", help="annotation set", default='its')
parser.add_argument("--project-id",
                    help="id of the zooniverse project to upload the chunks to",
                    default='')
args = parser.parse_args()

project = ChildProject(args.source)
project.read()

sampler = HighVolubilitySampler(project,
                                annotation_set=args.set,
                                metric='cvc',
                                windows_length=60 * 1000,
                                windows_count=10)
sampler.sample()
sampler.segments.to_csv('segments.csv')

zooniverse = ZooniversePipeline()
chunks_path = zooniverse.extract_chunks(path=project.path,
                                        destination=args.chunks_destination,
                                        keyword='example',
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import argparse
import os

parser = argparse.ArgumentParser(
    description='import and convert VTC annotations into the project')
parser.add_argument("--source", help="project path", required=True)
parser.add_argument("--overwrite", help="overwrite the existing 'vtc' annotation set",
                    dest='overwrite', action='store_true')
args = parser.parse_args()

project = ChildProject(args.source)
am = AnnotationManager(project)

if args.overwrite:
    am.remove_set('vtc')

input = project.recordings[['filename']]
input.rename(columns={'filename': 'recording_filename'}, inplace=True)
input = input[input['recording_filename'] != 'NA']
input['set'] = 'vtc'
input['time_seek'] = 0
input['range_onset'] = 0
input['range_offset'] = 0
input['raw_filename'] = input['recording_filename'].apply(
    lambda s: os.path.join('vtc', s + '.rttm'))
input['format'] = 'vtc_rttm'
def _load(self):
    self.project = ChildProject(self.ds.path)
    self.am = AnnotationManager(self.project)
    self.am.read()
def run(
    self,
    destination: str,
    segments: str,
    eaf_type: str,
    template: str,
    context_onset: int = 0,
    context_offset: int = 0,
    path: str = None,
    import_speech_from: str = None,
    **kwargs,
):
    """generate .eaf templates based on intervals to code.

    :param path: project path
    :type path: str
    :param destination: eaf destination
    :type destination: str
    :param segments: path to the input segments dataframe
    :type segments: str
    :param eaf_type: eaf-type [random, periodic]
    :type eaf_type: str
    :param template: name of the template to use (basic, native, or non-native)
    :type template: str
    :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
    :type context_onset: int
    :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
    :type context_offset: int
    """
    try:
        from importlib import resources
    except ImportError:
        # TODO: Perhaps add this as a dependency to the resources?
        import importlib_resources as resources

    etf_path = "{}.etf".format(template)
    pfsx_path = "{}.pfsx".format(template)

    if template in ["basic", "native", "non-native"]:
        with resources.path("ChildProject.templates", etf_path) as etf:
            etf_path = str(etf)
        with resources.path("ChildProject.templates", pfsx_path) as pfsx:
            pfsx_path = str(pfsx)

    if not os.path.exists(etf_path):
        raise Exception("{} cannot be found".format(etf_path))
    if not os.path.exists(pfsx_path):
        raise Exception("{} cannot be found".format(pfsx_path))

    print("making the " + eaf_type + " eaf file and csv")

    segments = pd.read_csv(segments)

    assert_dataframe("segments", segments, not_empty=True)
    assert_columns_presence(
        "segments",
        segments,
        {"recording_filename", "segment_onset", "segment_offset"},
    )

    imported_set = None
    prefill = path and import_speech_from
    if prefill:
        project = ChildProject(path)
        am = AnnotationManager(project)
        am.read()
        imported_set = import_speech_from

    for recording_filename, segs in segments.groupby("recording_filename"):
        recording_prefix = os.path.splitext(recording_filename)[0]
        output_filename = (
            recording_prefix + "_" + eaf_type + "_" + os.path.basename(template)
        )

        # TODO: This list of timestamps as tuples might not be ideal/should perhaps
        # be optimized, but I am just replicating the original eaf creation code here.
        timestamps = [
            (on, off)
            for on, off in segs.loc[:, ["segment_onset", "segment_offset"]].values
        ]

        speech_segments = None
        imported_format = None
        if prefill:
            ranges = segs.assign(recording_filename=recording_filename).rename(
                columns={
                    "segment_onset": "range_onset",
                    "segment_offset": "range_offset",
                }
            )
            matches = am.get_within_ranges(ranges, [import_speech_from], 'warn')

            if len(matches) == 0:
                continue

            speech_segments = am.get_segments(matches)
            try:
                matches = matches["format"].drop_duplicates()
                if len(matches.index) == 1:
                    imported_format = matches.iloc[0]
            except KeyError:
                imported_format = None

        output_dir = os.path.join(destination, recording_prefix)

        create_eaf(
            etf_path,
            output_filename,
            output_dir,
            recording_filename,
            timestamps,
            eaf_type,
            context_onset,
            context_offset,
            template,
            speech_segments,
            imported_set,
            imported_format,
        )

        shutil.copy(
            pfsx_path, os.path.join(output_dir, "{}.pfsx".format(output_filename))
        )
def extract_chunks(self,
                   destination,
                   path,
                   annotation_set='vtc',
                   batch_size=1000,
                   target_speaker_type='CHI',
                   sample_size=500,
                   chunk_length=500,
                   threads=0,
                   batches=0,
                   **kwargs):

    assert 1000 % chunk_length == 0, 'chunk_length should divide 1000'

    self.destination = destination
    self.project = ChildProject(path)

    batch_size = int(batch_size)
    sample_size = int(sample_size)
    chunk_length = int(chunk_length)
    threads = int(threads)

    self.sample_size = sample_size
    self.chunk_length = chunk_length

    am = AnnotationManager(self.project)
    self.annotations = am.annotations
    self.annotations = self.annotations[self.annotations['set'] == annotation_set]
    self.segments = am.get_segments(self.annotations)
    self.segments = self.segments[self.segments['speaker_type'] == target_speaker_type]
    self.segments['segment_onset'] = self.segments['segment_onset'] + self.segments['time_seek']
    self.segments['segment_offset'] = self.segments['segment_offset'] + self.segments['time_seek']

    destination_path = os.path.join(destination, 'chunks')
    os.makedirs(destination_path, exist_ok=True)
    if os.listdir(destination_path):
        raise ValueError(
            "destination '{}' is not empty, please choose another destination.".format(destination_path))

    segments = []
    for _recording, _segments in self.segments.groupby('recording_filename'):
        segments.append(_segments.assign(recording_filename=_recording))

    pool = mp.Pool(threads if threads > 0 else mp.cpu_count())
    self.chunks = pool.map(self.split_recording, segments)
    self.chunks = itertools.chain.from_iterable(self.chunks)
    self.chunks = pd.DataFrame([{
        'recording': c.recording,
        'onset': c.onset,
        'offset': c.offset,
        'wav': c.getbasename('wav'),
        'mp3': c.getbasename('mp3'),
        'speaker_type': target_speaker_type,
        'date_extracted': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'uploaded': False,
        'project_slug': '',
        'subject_set': '',
        'zooniverse_id': 0
    } for c in self.chunks])

    # shuffle chunks so that they can't be joined back together
    # based on Zooniverse subject IDs
    self.chunks = self.chunks.sample(frac=1).reset_index(drop=True)
    self.chunks['batch'] = self.chunks.index.map(lambda x: int(x / batch_size))
    self.chunks.index.name = 'index'
    self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import gamma, segments_to_grid, grid_to_vector, vectors_to_annotation_task

import argparse

parser = argparse.ArgumentParser(
    description='compute agreement measures for all given annotators over a whole dataset')
parser.add_argument('path', help='path to the dataset')
parser.add_argument('--sets', nargs='+', help='sets to include')
args = parser.parse_args()

speakers = ['CHI', 'OCH', 'FEM', 'MAL']

project = ChildProject(args.path)
am = AnnotationManager(project)
am.read()

intersection = AnnotationManager.intersection(am.annotations, args.sets)
segments = am.get_collapsed_segments(intersection)
segments = segments[segments['speaker_type'].isin(speakers)]

vectors = [
    grid_to_vector(
        segments_to_grid(segments[segments['set'] == s], 0,
                         segments['segment_offset'].max(), 100,
                         'speaker_type',