def merge_annotateddatasets(cfg, project_path, trainingsetfolder_full, windows2linux, video_set=None):
    """Merge the labeled-data h5 files of all labeled videos into one DataFrame.

    This is a bit of a mess because of cross platform compatibility.
    Within platform comp. is straightforward. But if someone labels on
    windows and wants to train on a unix cluster or colab...

    Parameters
    ----------
    cfg : dict
        Project configuration; must contain 'video_sets' and 'scorer'.
    project_path : str
        Root folder of the project (contains 'labeled-data').
    trainingsetfolder_full : str or Path
        Folder in which the merged CollectedData files are written.
    windows2linux : bool
        If True, convert windows-style image paths to unix style without asking.
    video_set : iterable of str, optional
        Subset of video paths to merge; defaults to all of cfg['video_sets'].

    Returns
    -------
    pandas.DataFrame or None
        The merged annotation data, or None if nothing was found.
    """
    data_path = Path(os.path.join(project_path, 'labeled-data'))
    if video_set is None:
        videos = cfg['video_sets'].keys()
    else:
        videos = video_set
    video_names = [Path(i).stem for i in videos]

    # Collect the per-video DataFrames and concatenate once at the end
    # (repeated pd.concat inside the loop is quadratic).
    annotations = []
    for name in video_names:
        file_path = str(data_path / Path(name)) + '/CollectedData_' + cfg['scorer'] + '.h5'
        try:
            annotations.append(pd.read_hdf(file_path, 'df_with_missing'))
        except FileNotFoundError:
            print(file_path, " not found (perhaps not annotated)")

    if annotations:
        AnnotationData = pd.concat(annotations)
    else:
        print("Annotation data was not found by splitting video paths (from config['video_sets']). An alternative route is taken...")
        AnnotationData = conversioncode.merge_windowsannotationdataONlinuxsystem(cfg)
        if AnnotationData is None:
            # BUG FIX: previously execution fell through here and crashed
            # with AttributeError on AnnotationData.to_hdf(None, ...).
            print("No data was found!")
            return None

    # True if the first index entry is in windows path format.
    windowspath = len(AnnotationData.index[0].split('\\')) > 1

    # Let's check if the code is *not* run on windows (Source:
    # https://stackoverflow.com/questions/1325581/how-do-i-check-if-im-running-on-windows-in-python)
    # but the paths are in windows format...
    if os.name != 'nt' and windowspath and not windows2linux:
        print("It appears that the images were labeled on a Windows system, but you are currently trying to create a training set on a Unix system. \n In this case the paths should be converted. Do you want to proceed with the conversion?")
        askuser = input("yes/no")
    else:
        askuser = '******'

    # BUG FIX: was trainingsetfolder_full + '/' + '/CollectedData_...',
    # producing a double slash in the output path.
    filename = os.path.join(str(trainingsetfolder_full), 'CollectedData_' + cfg['scorer'])
    if windows2linux or askuser in ('yes', 'y', 'Ja'):
        # Convert windows path separators (\\) in the pandas index to unix (/).
        AnnotationData = conversioncode.convertpaths_to_unixstyle(AnnotationData, filename, cfg)
        print("Annotation data converted to unix format...")
    else:  # store as is
        AnnotationData.to_hdf(filename + '.h5', key='df_with_missing', mode='w')
        AnnotationData.to_csv(filename + '.csv')  # human readable
    return AnnotationData
def merge_annotateddatasets(cfg, trainingsetfolder_full, windows2linux):
    """Merge the labeled-data h5 files of all labeled videos into one DataFrame.

    This is a bit of a mess because of cross platform compatibility.
    Within platform comp. is straightforward. But if someone labels on
    windows and wants to train on a unix cluster or colab...

    Returns the merged pandas.DataFrame (with the 'bodyparts' column level
    reordered to match the config file), or None if no data was found.
    """
    labeled_data_dir = Path(os.path.join(cfg["project_path"], "labeled-data"))

    # Gather one DataFrame per annotated video.
    frames = []
    for video in cfg["video_sets"].keys():
        _, filename, _ = _robust_path_split(video)
        file_path = os.path.join(labeled_data_dir / filename, f'CollectedData_{cfg["scorer"]}.h5')
        try:
            frames.append(pd.read_hdf(file_path, "df_with_missing"))
        except FileNotFoundError:
            print(
                file_path,
                " not found (perhaps not annotated). If training on cropped data, "
                "make sure to call `cropimagesandlabels` prior to creating the dataset.",
            )

    if not len(frames):
        print(
            "Annotation data was not found by splitting video paths (from config['video_sets']). An alternative route is taken..."
        )
        frames = conversioncode.merge_windowsannotationdataONlinuxsystem(cfg)
        if not len(frames):
            print("No data was found!")
            return

    merged = pd.concat(frames).sort_index()

    # When concatenating DataFrames with misaligned column labels,
    # all sorts of reordering may happen (mainly depending on 'sort' and 'join')
    # Ensure the 'bodyparts' level agrees with the order in the config file.
    if cfg.get("multianimalproject", False):
        (
            _,
            uniquebodyparts,
            multianimalbodyparts,
        ) = auxfun_multianimal.extractindividualsandbodyparts(cfg)
        bodyparts = multianimalbodyparts + uniquebodyparts
    else:
        bodyparts = cfg["bodyparts"]
    bodyparts_level = merged.columns.names.index("bodyparts")
    merged = merged.reindex(bodyparts, axis=1, level=bodyparts_level)

    # Let's check if the code is *not* run on windows (Source:
    # https://stackoverflow.com/questions/1325581/how-do-i-check-if-im-running-on-windows-in-python)
    # but the paths are in windows format...
    has_windows_paths = "\\" in merged.index[0]
    if os.name != "nt" and has_windows_paths and not windows2linux:
        print(
            "It appears that the images were labeled on a Windows system, but you are currently trying to create a training set on a Unix system. \n In this case the paths should be converted. Do you want to proceed with the conversion?"
        )
        askuser = input("yes/no")
    else:
        askuser = "******"

    filename = os.path.join(trainingsetfolder_full, f'CollectedData_{cfg["scorer"]}')
    if windows2linux or askuser in ("yes", "y", "Ja"):
        # convert windows path in pandas array \\ to unix / !
        merged = conversioncode.convertpaths_to_unixstyle(merged, filename)
        print("Annotation data converted to unix format...")
    else:  # store as is
        merged.to_hdf(filename + ".h5", key="df_with_missing", mode="w")
        merged.to_csv(filename + ".csv")  # human readable.

    return merged
def merge_annotateddatasets(cfg, project_path, trainingsetfolder_full, windows2linux):
    """Merge the labeled-data h5 files of all labeled videos into one DataFrame.

    This is a bit of a mess because of cross platform compatablity.
    Within platform comp. is straightforward. But if someone labels on
    windows and wants to train on a unix cluster or colab...

    Parameters
    ----------
    cfg : dict
        Project configuration; must contain 'video_sets', 'scorer' and 'bodyparts'.
    project_path : str
        Root folder of the project (contains 'labeled-data').
    trainingsetfolder_full : str or Path
        Folder in which the merged CollectedData files are written.
    windows2linux : bool
        If True, convert windows-style image paths to unix style without asking.

    Returns
    -------
    pandas.DataFrame or None
        The merged annotation data (bodyparts level reordered to match the
        config), or None if nothing was found.
    """
    AnnotationData = []
    data_path = Path(os.path.join(project_path, 'labeled-data'))
    videos = cfg['video_sets'].keys()
    video_names = [Path(i).stem for i in videos]
    for i in video_names:
        filename = os.path.join(data_path / i, f'CollectedData_{cfg["scorer"]}.h5')
        try:
            data = pd.read_hdf(filename, 'df_with_missing')
            AnnotationData.append(data)
        except FileNotFoundError:
            print(filename, " not found (perhaps not annotated)")

    if not len(AnnotationData):
        print(
            "Annotation data was not found by splitting video paths (from config['video_sets']). An alternative route is taken..."
        )
        AnnotationData = conversioncode.merge_windowsannotationdataONlinuxsystem(cfg)
        if not len(AnnotationData):
            print("No data was found!")
            return

    AnnotationData = pd.concat(AnnotationData).sort_index()
    # When concatenating DataFrames with misaligned column labels,
    # all sorts of reordering may happen (mainly depending on 'sort' and 'join')
    # Ensure the 'bodyparts' level agrees with the order in the config file.
    bodyparts = cfg['bodyparts']
    AnnotationData = AnnotationData.reindex(
        bodyparts, axis=1, level=AnnotationData.columns.names.index('bodyparts'))

    # Let's check if the code is *not* run on windows (Source:
    # https://stackoverflow.com/questions/1325581/how-do-i-check-if-im-running-on-windows-in-python)
    # but the paths are in windows format...
    windowspath = '\\' in AnnotationData.index[0]
    if os.name != 'nt' and windowspath and not windows2linux:
        print(
            "It appears that the images were labeled on a Windows system, but you are currently trying "
            "to create a training set on a Unix system. \n "
            "In this case the paths should be converted. Do you want to proceed with the conversion?"
        )
        askuser = input("yes/no")
    else:
        askuser = '******'

    # BUG FIX: was str(trainingsetfolder_full) + '/' + '/CollectedData_...',
    # which produced a double slash in the output path.
    filename = os.path.join(str(trainingsetfolder_full), 'CollectedData_' + cfg['scorer'])
    if windows2linux or askuser == 'yes' or askuser == 'y' or askuser == 'Ja':
        # convert windows path in pandas array \\ to unix / !
        AnnotationData = conversioncode.convertpaths_to_unixstyle(
            AnnotationData, filename, cfg)
        print("Annotation data converted to unix format...")
    else:  # store as is
        AnnotationData.to_hdf(filename + '.h5', key='df_with_missing', mode='w')
        AnnotationData.to_csv(filename + '.csv')  # human readable.

    return AnnotationData