def get_obsolete(output_dir, data_type):
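    """Return IDs of stored ``data_type`` datapoints that no longer appear in the track list."""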
    _track_list = TrackList.load_from_dir(output_dir)
    return [
        track_file.stem
        for track_file in Path(get_data_dir(output_dir, data_type)).glob('*.pickle')
        if not _track_list.have_track_id(track_file.stem)
    ]


def get_missing(output_dir, data_type):
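    """Return IDs from the track list that do not yet have a stored ``data_type`` datapoint."""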
    _track_list = TrackList.load_from_dir(output_dir)
    return [
        track_id
        for track_id in _track_list.get_track_ids()
        if not have_datapoint(output_dir, data_type, track_id)
    ]


def count(output_dir):
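    """Print how many tracks the track list expects versus how many feature and analysis objects are stored."""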
    track_list = TrackList.load_from_dir(output_dir)
    N = len(track_list.track_ids)
    print(f'expecting {N} tracks')
    N_feat = count_data_points(output_dir, AUDIO_FEATURES)
    print(f'found {N_feat} features objects')
    N_ana = count_data_points(output_dir, AUDIO_ANALYSIS)
    print(f'found {N_ana} analysis objects')


def fetch(output_dir):
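    """Download the audio analysis for every track that is still missing it.

    Tracks the API reports as not found are removed from the track list,
    which is dumped again once fetching finishes.
    """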
    spinner = Halo('Fetching tracks', spinner='dots')
    spinner.start()
    _track_list = TrackList.load_from_dir(output_dir)
    track_ids = get_missing(output_dir, AUDIO_ANALYSIS)
    makedirs(get_data_dir(output_dir, AUDIO_ANALYSIS), exist_ok=True)
    track_analyses = n_track_analyses_generator(track_ids)
    # Resume the progress count from the datapoints already on disk.
    n_fetched = count_data_points(output_dir, AUDIO_ANALYSIS)
    for track_analysis in track_analyses:
        if 'track_not_found' in track_analysis:
            _track_list.remove_track_id(track_analysis['track_not_found'])
            print(f"removed {track_analysis['track_not_found']} from dataset")
            continue
        n_fetched += 1
        extracted = extract_track_analysis(track_analysis)
        spinner.text = f'Fetching tracks ({(n_fetched / _track_list.get_desired_tracks_amount()) * 100:.2f}%)'
        store_extracted_analysis(output_dir, extracted)
    spinner.stop()
    _track_list.dump(output_dir)


def collect_data(data_dir,
                 test_split,
                 test_split_index=0,
                 verbose=False,
                 dry=False,
                 subset=10000,
                 min_conf=0):
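    """Load all tracks, filter them by key confidence, and build one k-fold CV split.

    ``test_split`` is the number of folds and ``test_split_index`` selects the
    held-out fold; ``min_conf`` is given in percent. Returns a
    ``(training_data, testing_data)`` pair of ``{track_id: track}`` dicts.
    """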
    track_list = TrackList.load_from_dir(data_dir)
    all_tracks = track_list.get_track_ids()
    N = len(all_tracks)
    if dry:  # dry run: cap the dataset at `subset` tracks just to exercise the pipeline
        N = subset

    # Load all data
    if verbose:
        print("Collecting data...")
    data = load_data_dict(data_dir, np.array(all_tracks))

    # Filter by minimum key confidence (min_conf is given in percent)
    data_minconf = {
        track_id: track
        for track_id, track in data.items()
        if track['key_confidence'] >= min_conf / 100
    }

    # k-fold CV splits
    n = min(N, len(data_minconf))
    chunks = np.array_split(np.arange(n), test_split)
    test_idx = chunks[test_split_index]
    train_idx = np.concatenate(chunks[:test_split_index] +
                               chunks[test_split_index + 1:])

    # Split data
    trackids = np.array(list(data_minconf.keys()))
    tracks = np.array(list(data_minconf.values()))
    testing_data = dict(zip(trackids[test_idx], tracks[test_idx]))
    training_data = dict(zip(trackids[train_idx], tracks[train_idx]))

    if verbose:
        print("Data collected. [min_conf={}, n={}, N={}]".format(
            min_conf, n, N))
    return training_data, testing_data


def list_tracks(mpl_data_path, output_dir, n, list_dir='', _track_list: TrackList = None) -> None:
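    """Assemble a track list of roughly ``n`` tracks, balanced over the 24 key/mode classes.

    When an existing ``TrackList`` is given it is treated as complete: no new
    IDs are added, and only the audio features still missing on disk are fetched.
    """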
    spinner = Halo(text='Listing tracks', spinner='dots')
    spinner.start()
    makedirs(get_data_dir(output_dir, AUDIO_FEATURES), exist_ok=True)
    # Count the number of tracks per key/mode class: 12 pitch classes x 2 modes = 24 classes
    required_per_key = ceil(n / 24)
    key_counts = {key: 0 for key in range(24)}
    # Start from a provided track list or create a new one
    track_list_complete = False
    if _track_list is not None:
        track_list_complete = True
        track_id_gen = listing_track_id_generator(output_dir, mpl_data_path, _track_list.have_track_id)
        # Rebuild the per-class counts from the features already stored on disk.
        for track_id in get_datapoint_ids(output_dir, AUDIO_FEATURES):
            f = load_features(output_dir, track_id)
            key = f['key'] + (f['mode'] * 12)
            key_counts[key] += 1
    else:
        track_id_gen = track_id_generator(mpl_data_path)
        _track_list = TrackList()
        _track_list.set_desired_tracks_amount(n)
    total = count_data_points(output_dir, AUDIO_FEATURES)
    _track_list.dump(output_dir)

    def finished():
        # Done when every key/mode class has enough tracks, or enough
        # feature objects are stored overall.
        return all(key_counts[key] >= required_per_key for key in range(24)) or \
               count_data_points(output_dir, AUDIO_FEATURES) >= n
    while not finished():
        have_track = partial(have_datapoint, output_dir, AUDIO_FEATURES)
        track_ids = get_n_track_ids(track_id_gen, 100, have_track)
        track_feats = n_track_features(track_ids)
        for track_feat in track_feats:
            extracted_track_features = extract_audio_features(track_feat)
            key = extracted_track_features['key'] + (extracted_track_features['mode'] * 12)
            if key_counts[key] < required_per_key:
                key_counts[key] += 1
                if not track_list_complete:
                    _track_list.add_track_id(extracted_track_features['id'])
                store_extracted_features(output_dir, extracted_track_features)
                total += 1
            _track_list.dump(output_dir)  # persist after every track so an interrupted run can resume
            if finished():
                break
        perc = 100 * (total / n)
        spinner.text = f'Listing tracks ({perc:.2f}%)'
    spinner.stop()
    _track_list.dump(output_dir)
    if list_dir:
        _track_list.dump(list_dir)


def create_track_list(track_ids) -> TrackList:
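    """Build a ``TrackList`` from the given track IDs."""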
    _track_list = TrackList()
    _track_list.set_track_ids(track_ids)
    return _track_list


if __name__ == '__main__':
    args = get_args()
    if args.command == 'list':
        track_list = None
        if args.use_list:
            track_list = TrackList.load(args.use_list)
            args.N = track_list.get_desired_tracks_amount()
        else:
            output_dir = (Path(getcwd()) / args.output_dir).absolute()
            if any(Path(get_data_dir(args.output_dir, AUDIO_FEATURES)).glob('*.pickle')):
                print(f'There are already audio features downloaded and stored in the output directory '
                      f'({output_dir}). Either provide the path to a track_list.pickle with --use-list or '
                      f'make sure that {output_dir} is empty.')
                exit()
        list_tracks(args.mpl_dir, args.output_dir, args.N, args.list_dir, track_list)
    elif args.command == 'fetch':
        fetch(args.output_dir)
    elif args.command == 'check':
        check(args.output_dir)
    elif args.command == 'count':
        count(args.output_dir)