def align(self,reference,collapse_functions=None,replace=True):
    """Align every computational sequence in this dataset to `reference`.

    For each interval i of each entry in the reference sequence, the
    overlapping portions of every other sequence are copied out (via
    self.__intersect_and_copy) and stored under the key "entry[i]".

    :param reference: name of the computational sequence whose intervals
        drive the alignment; must exist in self.computational_sequences.
    :param collapse_functions: optional list of functions; when provided,
        intersected intervals/features are summarized by self.__collapse.
    :param replace: True -> replace this dataset's content in place and
        return None; False -> return a new mmdataset with the aligned data.
    """
    aligned_output={}
    for sequence_name in self.computational_sequences.keys():
        aligned_output[sequence_name]={}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
    refseq=self.computational_sequences[reference].data
    #unifying the dataset, removing any entries that are not in the reference computational sequence
    self.unify()
    #building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
    log.status("Pre-alignment based on <%s> computational sequence started ..."%reference)
    relevant_entries=self.__get_relevant_entries(reference)
    log.status("Alignment starting ...")
    pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
    pbar.set_description("Overall Progress")
    for entry_key in list(refseq.keys()):
        pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
        pbar_small.set_description("Aligning %s"%entry_key)
        for i in range(refseq[entry_key]['intervals'].shape[0]):
            #interval for the reference sequence
            ref_time=refseq[entry_key]['intervals'][i,:]
            #we drop zero or very small sequence lengths - no align for those
            if (abs(ref_time[0]-ref_time[1])<epsilon):
                pbar_small.update(1)
                continue
            #aligning all sequences (including ref sequence) to ref sequence
            for otherseq_key in list(self.computational_sequences.keys()):
                if otherseq_key != reference:
                    intersects,intersects_features=self.__intersect_and_copy(ref_time,relevant_entries[otherseq_key][entry_key],epsilon)
                else:
                    # the reference aligns to itself: the single interval/feature row, kept 2-D via None-indexing
                    intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
                #there were no intersections between reference and subject computational sequences for the entry
                if intersects.shape[0] == 0:
                    continue
                #collapsing according to the provided functions
                if type(collapse_functions) is list:
                    intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
                if(intersects.shape[0]!=intersects_features.shape[0]):
                    log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference),error=True)
                aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
                aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
                aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
            pbar_small.update(1)
        pbar_small.close()
        pbar.update(1)
    pbar.close()
    log.success("Alignment to <%s> complete."%reference)
    if replace is True:
        log.status("Replacing dataset content with aligned computational sequences")
        self.__set_computational_sequences(aligned_output)
        return None
    else:
        log.status("Creating new dataset with aligned computational sequences")
        newdataset=mmdataset({})
        newdataset.__set_computational_sequences(aligned_output,metadata_copy=False)
        return newdataset
def hard_unify(self,active=True):
    """Keep only the segment ids shared by all (already aligned) sequences.

    Unlike unify(), ids are compared verbatim (including the "[i]" suffix),
    which is why every sequence must already be aligned.

    :param active: True -> remove non-shared entries; False -> only report
        how many violators remain (and raise via log.error if any).
    """
    log.status("Hard unify was called ...")
    # NOTE(review): all_vidids appears unused in this function — confirm before removing
    all_vidids={}
    violators=[]
    all_keys={}
    for seq_key in list(self.computational_sequences.keys()):
        all_keys[seq_key]=[vidid for vidid in self.computational_sequences[seq_key].data.keys()]
    # ids present in every computational sequence (exact match, brackets included)
    valids=set.intersection(*[set(all_keys[x]) for x in all_keys])
    for seq_key in list(self.computational_sequences.keys()):
        # aligned sequences carry "[i]" in every entry id; anything else is incompatible
        hard_unify_compatible=all(["[" in vidid for vidid in self.computational_sequences[seq_key].data.keys()])
        if hard_unify_compatible is False:
            log.error("Hard unify can only be done on aligned computational sequences, %s violated this ... Exiting ..."%seq_key)
        violators=set([vidid for vidid in self.computational_sequences[seq_key].data.keys()])-valids
        for violator in violators:
            if active==True:
                log.error("%s entry is not shared among all sequences, removing it ..."%violator,error=False)
                self[seq_key]._remove_id(violator,purge=False)
    # NOTE(review): at this point `violators` only reflects the last sequence
    # examined in the loop above — verify this is the intended count
    if active==False and len(violators)>0:
        log.error("%d violators remain, alignment will fail if called ..."%len(violators),error=True)
    log.success("Hard unify completed ...")
def validateMetadataIntegrity(metadata, rootName, which=True):
    """Check that `metadata` is a dict containing every key required by
    featuresetMetadataTemplate.

    :param metadata: metadata mapping of a computational sequence.
    :param rootName: sequence name, used in log messages.
    :param which: when True, report which template keys are missing.
    :return: True when all required metadata is present (otherwise
        log.error(..., error=True) is expected to raise).
    """
    log.status(
        "Checking the integrity of the metadata in <%s> computational sequence ..."
        % rootName)
    failure = False
    if type(metadata) is not dict:
        # fix: the original passed the format string without applying
        # "% rootName", so "<%s>" was printed literally
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!"
            % rootName, error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if all(presenceFlag) is False:
        #which one is not set
        if which:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (rootName, str(missings)), error=False)
        failure = True
    #if failed before
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ..."
            % rootName, error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" %
                    rootName)
    return True
def readURL(url, destination):
    """Download `url` to the file path `destination`, streaming in 1 KB
    blocks. Reuses an existing file instead of re-downloading.

    :param url: source URL.
    :param destination: target file path; its parent directory is created
        when missing.
    :return: True on completion.
    """
    # fix: validate destination BEFORE dereferencing it — the original called
    # destination.rsplit(...) first, so a None destination raised
    # AttributeError instead of the intended error message
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    #TODO: replace the split of destination with cross-os compatible operation
    if os.path.isdir(destination.rsplit('/', 1)[-2]) is False:
        os.mkdir(destination.rsplit('/', 1)[-2])
    if os.path.isfile(destination):
        log.success("File already downloaded, use the old file")
    else:
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            log.error('URL: %s does not exist' % url, error=True)
        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        wrote = 0
        with open(destination, 'wb') as f:
            log.status("Downloading from %s to %s..." % (url, destination))
            for data in tqdm(r.iter_content(block_size),
                             total=math.ceil(total_size // block_size),
                             unit='KB',
                             unit_scale=True,
                             leave=False):
                wrote = wrote + len(data)
                f.write(data)
        # fix: removed redundant f.close() — the with block already closes f
        if total_size != 0 and wrote != total_size:
            log.error("Error downloading the data ...")
    log.success("Download complete!")
    return True
def __init__(self, recipe, destination=None):
    """Build a dataset either from a folder of .csd files (recipe is a
    path string) or from a {name: address} mapping (recipe is a dict).

    :param recipe: folder path or {sequence name: resource address} dict.
    :param destination: forwarded to computational_sequence for dict recipes.
    """
    self.computational_sequences = {}
    if type(recipe) is str:
        if os.path.isdir(recipe) is False:
            log.error("Dataset folder does not exist ...", error=True)
        from os import listdir
        from os.path import isfile, join
        # load every .csd file found directly inside the folder
        for fname in listdir(recipe):
            full_path = join(recipe, fname)
            if not (isfile(full_path) and fname[-4:] == '.csd'):
                continue
            seq = computational_sequence(full_path)
            self.computational_sequences[seq.metadata["root name"]] = seq
    if type(recipe) is dict:
        # each entry names a sequence and points at its resource
        for entry, address in recipe.items():
            self.computational_sequences[entry] = computational_sequence(
                address, destination)
    if len(self.computational_sequences.keys()) == 0:
        log.error("Dataset failed to initialize ...", error=True)
    log.success("Dataset initialized successfully ... ")
def impute(self, ref_key, imputation_fn=numpy.zeros):
    """For every segment present in the reference sequence but missing from
    another sequence, insert an imputed entry sharing the reference
    intervals and a single imputed feature row.

    :param ref_key: name of the reference computational sequence.
    :param imputation_fn: factory taking a shape list and returning the
        imputed features (default numpy.zeros).
    """
    log.status("Imputation called ...")
    other_keys = list(self.keys())
    other_keys.remove(ref_key)
    # feature dims (everything after the first axis) of each other sequence,
    # probed from an arbitrary entry
    # fix: dict KeysView is not subscriptable in Python 3 — wrap in list()
    other_keys_dims = {
        x: list(self[x][list(self[x].keys())[0]]["features"].shape[1:])
        for x in other_keys
    }
    pbar = tqdm(total=len(self[ref_key].keys()),
                unit=" Reference Computational Sequence Entries",
                leave=False)
    pbar.set_description("Imputation Progress")
    for seg_key in self[ref_key].keys():
        for other_key in other_keys:
            try:
                self[other_key][seg_key]
            # NOTE(review): presumably a missing segment raises KeyError;
            # kept broad to match the project __getitem__ contract — confirm
            except:
                self[other_key][seg_key] = {
                    "intervals": self[ref_key][seg_key]["intervals"],
                    "features": imputation_fn([1] + other_keys_dims[other_key])
                }
        pbar.update(1)
    pbar.close()
    log.success("Imputation completed ...")
def initBlank(resource):
    """Create an empty computational-sequence payload named `resource`.

    :param resource: root name to record in the metadata.
    :return: (None, empty data dict, metadata dict with "root name" set).
    """
    metadata = {"root name": resource}
    data = {}
    log.success("Initialized empty <%s> computational sequence." %
                metadata["root name"])
    return None, data, metadata
def validate_metadata_format(metadata, root_name, verbose=True):
    """Check that `metadata` is a dict containing every key required by
    featuresetMetadataTemplate.

    :param metadata: metadata mapping of a computational sequence.
    :param root_name: sequence name, used in log messages.
    :param verbose: when True, report which template keys are missing.
    :return: True when all required metadata is present, False otherwise.
    """
    log.status(
        "Checking the format of the metadata in <%s> computational sequence ..."
        % root_name)
    failure = False
    if type(metadata) is not dict:
        # fix: the original passed the format string without applying
        # "% root_name", so "<%s>" was printed literally
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!"
            % root_name, error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if all(presenceFlag) is False:
        #verbose one is not set
        if verbose:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (root_name, str(missings)), error=False)
        failure = True
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ... continuing "
            % root_name, error=False)
        return False
    else:
        log.success("<%s> computational sequence metadata in correct format." %
                    root_name)
        return True
def __unify_dataset(self, active=True):
    """Remove entries whose base video id is not present in every
    computational sequence, so alignment can proceed.

    :param active: True -> remove violators; False -> only report them
        (raising via log.error when any remain).
    """
    log.status("Unify was called ...")
    all_vidids = {}
    violators = []
    # collect the set of base video ids (the part before any "[i]" suffix)
    for seq_key in list(self.computational_sequences.keys()):
        for vidid in list(
                self.computational_sequences[seq_key].data.keys()):
            vidid = vidid.split('[')[0]
            all_vidids[vidid] = True
    # a base id violates if some sequence has no entry starting with it
    for vidid in list(all_vidids.keys()):
        for seq_key in list(self.computational_sequences.keys()):
            # NOTE(review): this is a prefix comparison, so id "abc" is
            # satisfied by an entry "abcdef[0]" as well — confirm intended
            if not any([
                    vidid_in_seq for vidid_in_seq in
                    self.computational_sequences[seq_key].data.keys()
                    if vidid_in_seq[:len(vidid)] == vidid
            ]):
                violators.append(vidid)
    if len(violators) > 0:
        for violator in violators:
            log.error(
                "%s entry is not shared among all sequences, removing it ..."
                % violator, error=False)
            if active == True:
                self.__remove_id(violator)
    if active == False and len(violators) > 0:
        log.error(
            "%d violators remain, alignment will fail if called ..." %
            len(violators), error=True)
    log.success("Unify finished, dataset is compatible for alignment ...")
def revert(self, replace=True):
    """Undo a previous alignment: concatenate the "entry[i]" segments of
    every computational sequence back into whole per-entry arrays.

    :param replace: True -> replace this dataset's content in place and
        return None; False -> return a new mmdataset with the reverted data.
    """
    reverted_dataset = {x: {} for x in self.keys()}
    log.status("Revert was called ...")
    if len(self.keys()) == 0:
        log.error(
            "The dataset contains no computational sequences ... Exiting!",
            error=True)
    self.unify()
    # fix: KeysView is not subscriptable in Python 3 — self.keys()[0] raised
    # TypeError; index through list() instead
    all_keys = self[list(self.keys())[0]].keys()
    if len(all_keys) == 0:
        log.error(
            "No entries in computational sequences or unify found no shared entries ... Exiting!"
        )
    # group segment indices by their base entry id ("vid[3]" -> vid: [3, ...])
    unique_unnumbered_entries = {}
    for key in all_keys:
        if key.split('[')[0] not in unique_unnumbered_entries:
            unique_unnumbered_entries[key.split('[')[0]] = []
        unique_unnumbered_entries[key.split('[')[0]].append(
            int(key.split('[')[1][:-1]))
    pbar = tqdm(total=len(unique_unnumbered_entries.keys()),
                unit=" Unique Sequence Entries",
                leave=False)
    pbar.set_description("Reversion Progress")
    for key in unique_unnumbered_entries.keys():
        # concatenate segments in index order so the original order returns
        unique_unnumbered_entries[key].sort()
        for cs_key in reverted_dataset.keys():
            intervals = numpy.concatenate([
                self[cs_key][str('%s[%d]' % (key, i))]["intervals"]
                for i in unique_unnumbered_entries[key]
            ], axis=0)
            features = numpy.concatenate([
                self[cs_key][str('%s[%d]' % (key, i))]["features"]
                for i in unique_unnumbered_entries[key]
            ], axis=0)
            reverted_dataset[cs_key][key] = {
                "intervals": intervals,
                "features": features
            }
        pbar.update(1)
    pbar.close()
    log.success("Reversion completed ...")
    if replace is True:
        log.status(
            "Replacing dataset content with reverted computational sequences"
        )
        self.__set_computational_sequences(reverted_dataset)
        return None
    else:
        log.status(
            "Creating new dataset with reverted computational sequences")
        newdataset = mmdataset({})
        newdataset.__set_computational_sequences(reverted_dataset,
                                                 metadata_copy=False)
        return newdataset
def __initialize_blank(self, root_name):
    """Reset this computational sequence to an empty state under the given
    root name (no file handle, no data, minimal metadata).

    :param root_name: name recorded under metadata["root name"].
    """
    self.main_file = None
    self.h5handle = None
    self.root_name = root_name
    self.metadata = {"root name": root_name}
    self.data = {}
    log.success("Initialized empty <%s> computational sequence." %
                self.metadata["root name"])
def align(self,reference,collapse_functions=None,replace=True):
    """Align every computational sequence in this dataset to `reference`.

    This variant intersects directly against each computational sequence
    object (no pre-built relevant-entries map) and requires entry ids to be
    unified across sequences beforehand.

    :param reference: name of the computational sequence whose intervals
        drive the alignment.
    :param collapse_functions: optional list of functions used by
        self.__collapse to summarize intersected segments.
    :param replace: True -> replace dataset content in place and return
        None; False -> return a new mmdataset with the aligned output.
    """
    aligned_output={}
    for sequence_name in self.computational_sequences.keys():
        aligned_output[sequence_name]={}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
    refseq=self.computational_sequences[reference].data
    #this for loop is for entry_key - for example video id or the identifier of the data entries
    log.status("Alignment based on <%s> computational sequence started ..."%reference)
    self.__unify_dataset()
    pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
    pbar.set_description("Overall Progress")
    for entry_key in list(refseq.keys()):
        pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
        pbar_small.set_description("Aligning %s"%entry_key)
        for i in range(refseq[entry_key]['intervals'].shape[0]):
            #interval for the reference sequence
            ref_time=refseq[entry_key]['intervals'][i,:]
            #we drop zero or very small sequence lengths - no align for those
            if (abs(ref_time[0]-ref_time[1])<epsilon):
                pbar_small.update(1)
                continue
            #aligning all sequences (including ref sequence) to ref sequence
            for otherseq_key in list(self.computational_sequences.keys()):
                if entry_key.split('[')[0] not in self.computational_sequences[otherseq_key]._get_entries_stripped():
                    log.error("The dataset does not have unified entry ids across computational sequences. Please call intersect first ...")
                if otherseq_key != reference:
                    intersects,intersects_features=self.__intersect_and_copy(entry_key,ref_time,self.computational_sequences[otherseq_key],epsilon)
                else:
                    # the reference aligns to itself: keep the row 2-D via None-indexing
                    intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
                #there were no intersections between reference and subject computational sequences for the entry
                if intersects.shape[0] == 0:
                    continue
                #collapsing according to the provided functions
                if type(collapse_functions) is list:
                    intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
                # NOTE(review): unlike the other align variant, this log.error
                # has no error=True — verify a mismatch should not abort
                if(intersects.shape[0]!=intersects_features.shape[0]):
                    log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference))
                aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
                aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
                aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
            pbar_small.update(1)
        pbar_small.close()
        pbar.update(1)
    pbar.close()
    log.success("Alignment to <%s> complete."%reference)
    if replace is True:
        log.status("Replacing dataset content with aligned computational sequences")
        self.__set_computational_sequences(aligned_output)
        return None
    else:
        log.status("Creating new dataset with aligned computational sequences")
        newdataset=mmdataset({})
        newdataset.__set_computational_sequences(aligned_output)
        return newdataset
def _checkIntegrity(self, error=True):
    """Validate this computational sequence: data and metadata must exist
    and pass their respective integrity checks.

    :param error: kept for interface compatibility with callers.
    """
    if not hasattr(self, 'metadata') or not hasattr(self, 'data'):
        log.error("computational sequence is blank (data or metadata is missing)")
    root = self.metadata["root name"]
    log.status("Checking the integrity of the <%s> computational sequence ..." % root)
    #TODO: hash check not implemented yet
    data_ok = validateDataIntegrity(self.data, root, which=False)
    metadata_ok = validateMetadataIntegrity(self.metadata, root, which=False)
    if data_ok and metadata_ok:
        log.success("<%s> computational sequence is valid!" % root)
def readCSD(resource, destination=None):
    """Open a .csd (hdf5) computational-sequence file.

    :param resource: path to the .csd file.
    :param destination: unused; kept for interface compatibility.
    :return: (h5py handle, data dict, metadata dict).
    """
    if (resource is None):
        raise log.error("No resource specified for computational sequence!", error=True)
    try:
        h5handle = h5py.File('%s' % resource, 'r')
    except:
        raise log.error("%s resource is not a valid hdf5 computational sequence ..." % resource, error=True)
    log.success("Computational sequence read from file %s ..." % resource)
    # fix: h5py .keys() returns a KeysView in Python 3, which is not
    # subscriptable — h5handle.keys()[0] raised TypeError
    root_key = list(h5handle.keys())[0]
    return h5handle, dict(h5handle[root_key]["data"]), metadataToDict(h5handle[root_key]["metadata"])
def validateDataIntegrity(data, rootName, which=True):
    """Check that `data` is a dict of entries whose "intervals" and
    "features" arrays are 2-D and agree on their first dimension.

    :param data: {entry id: {"intervals": array, "features": array}}.
    :param rootName: sequence name, used in log messages.
    :param which: when True, log each offending entry individually.
    :return: True when all entries are consistent (a failure is expected to
        raise via log.error(..., error=True)).
    """
    log.status(
        "Checking the integrity of the data in <%s> computational sequence ..."
        % rootName)
    failure = False
    if (type(data) is not dict):
        #this will cause the rest of the pipeline to crash - RuntimeError
        # fix: the original passed the format string without applying
        # "% rootName", so "%s" was printed literally
        log.error(
            "%s computational sequence data is not in heirarchy format ..."
            % rootName, error=True)
    try:
        #for each video check the shapes of the intervals and features
        for vid in data.keys():
            #check the intervals first - if failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong intervals array shape. "
                        % (vid, rootName), error=False)
                failure = True
            #check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong features array shape. "
                        % (vid, rootName), error=False)
                failure = True
            #if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[0]:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. "
                        % (vid, rootName), error=False)
                failure = True
    #some other thing has happened! - RuntimeError
    except:
        if which:
            log.error(
                "<%s> computational sequence data itegrity could not be checked. "
                % rootName, error=True)
    #failure during intervals and features check
    if failure:
        log.error(
            "<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. "
            % rootName, error=True)
    else:
        log.success("<%s> computational sequence data in correct format." %
                    rootName)
    return True
def read_CSD(resource, destination=None):
    """Open a .csd (hdf5) computational-sequence file after verifying the
    path exists.

    :param resource: path to the .csd file.
    :param destination: unused; kept for interface compatibility.
    :return: (h5py handle, data dict, metadata dict).
    """
    if resource is None:
        raise log.error("No resource specified for computational sequence!", error=True)
    if os.path.isfile(resource) is False:
        log.error("%s file not found, please check the path ..." % resource, error=True)
    try:
        h5handle = h5py.File('%s' % resource, 'r')
    except:
        raise log.error("%s resource is not a valid hdf5 computational sequence format ..." % resource, error=True)
    log.success("Computational sequence read from file %s ..." % resource)
    root_key = list(h5handle.keys())[0]
    content = h5handle[root_key]
    return h5handle, dict(content["data"]), metadata_to_dict(content["metadata"])
def writeCSD(data, metadata, rootName, destination):
    """Write a computational sequence (data + metadata) to an hdf5 file.

    :param data: {entry id: {"features": array, "intervals": array}}.
    :param metadata: metadata mapping to store alongside the data.
    :param rootName: name of the root hdf5 group.
    :param destination: output file path.
    """
    #check the data to make sure it is in correct format
    validateDataIntegrity(data, rootName)
    validateMetadataIntegrity(metadata, rootName)
    log.status("Writing the <%s> computational sequence data to %s" % (rootName, destination))
    #opening the file
    writeh5Handle = h5py.File(destination, 'w')
    #creating the root handle
    rootHandle = writeh5Handle.create_group(rootName)
    #writing the data
    dataHandle = rootHandle.create_group("data")
    pbar = tqdm(total=len(data.keys()), unit=" Computational Sequence Entries", leave=False)
    for vid in data:
        vidHandle = dataHandle.create_group(vid)
        vidHandle.create_dataset("features", data=data[vid]["features"])
        vidHandle.create_dataset("intervals", data=data[vid]["intervals"])
        pbar.update(1)
    pbar.close()
    log.success("<%s> computational sequence data successfully wrote to %s" % (rootName, destination))
    log.status("Writing the <%s> computational sequence metadata to %s" % (rootName, destination))
    #writing the metadata
    metadataHandle = rootHandle.create_group("metadata")
    # fix: `sys.version_info.major is 2` relied on CPython small-int identity;
    # use == for a value comparison. The Python-2/3 string type is also now
    # selected once instead of being re-evaluated per key.
    cast_operator = unicode if sys.version_info.major == 2 else str
    for metadataKey in metadata.keys():
        metadataHandle.create_dataset(metadataKey, (1,), dtype=h5py.special_dtype(vlen=cast_operator))
        metadataHandle[metadataKey][0] = cast_operator(metadata[metadataKey])
    writeh5Handle.close()
    log.success("<%s> computational sequence metadata successfully wrote to %s" % (rootName, destination))
    log.success("<%s> computational sequence successfully wrote to %s ..." % (rootName, destination))
def readURL(url, destination):
    """Download `url` to the file path `destination`, streaming in 1 KB
    blocks. Refuses to overwrite an existing file.

    :param url: source URL.
    :param destination: target file path; must not already exist.
    :return: True on completion.
    """
    if destination is None:
        log.error("Destination is not specified when downloading data", error=True)
    if (os.path.isfile(destination)):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        for data in tqdm(r.iter_content(block_size), total=math.ceil(total_size // block_size), unit='KB', unit_scale=True):
            wrote = wrote + len(data)
            f.write(data)
    # fix: removed redundant f.close() — the with block already closes f —
    # and a stray trailing semicolon
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data ...")
    log.success("Download complete!")
    return True
def read_URL(url, destination):
    """Download `url` to the file path `destination` through the project
    progress bar, creating the parent directory when missing.

    :param url: source URL.
    :param destination: target file path; must not already exist.
    :return: True on completion.
    """
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    if os.path.isdir(destination.rsplit(os.sep, 1)[-2]) is False:
        os.mkdir(destination.rsplit(os.sep, 1)[-2])
    if (os.path.isfile(destination)):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    # fix: removed unused local `unit = total_size / block_size`
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        pbar = log.progress_bar(total=math.ceil(total_size // block_size),
                                data=r.iter_content(block_size),
                                postfix="Total in kBs",
                                unit='kB',
                                leave=False)
        for data in pbar:  #unit_scale=True,
            wrote = wrote + len(data)
            f.write(data)
        pbar.close()
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data to %s ..." % destination,
                  error=True)
    log.success("Download complete!")
    return True
def unify(self, active=True):
    """Keep only the base video ids (the part before any "[i]" suffix)
    shared by every computational sequence.

    :param active: True -> remove non-shared entries (purged); False ->
        only report them (raising via log.error when any remain).
    """
    log.status("Unify was called ...")
    # improvement: dropped the unused `all_vidids`/list-`violators` locals and
    # build each sequence's stripped-id set once instead of twice
    stripped_ids = {}
    for seq_key in list(self.computational_sequences.keys()):
        stripped_ids[seq_key] = set(
            vidid.split("[")[0]
            for vidid in self.computational_sequences[seq_key].data.keys())
    # ids present in every sequence
    valids = set.intersection(*[stripped_ids[x] for x in stripped_ids])
    violators = set()
    for seq_key in list(self.computational_sequences.keys()):
        violators = violators.union(stripped_ids[seq_key] - valids)
    if len(violators) > 0:
        for violator in violators:
            log.error(
                "%s entry is not shared among all sequences, removing it ..."
                % violator, error=False)
            if active == True:
                self.remove_id(violator, purge=True)
    if active == False and len(violators) > 0:
        log.error(
            "%d violators remain, alignment will fail if called ..." %
            len(violators), error=True)
    log.success("Unify completed ...")
def write_CSD(data, metadata, rootName, destination, compression, compression_opts, full_chunk_shape):
    """Write a computational sequence (data + metadata) to an hdf5 file,
    optionally compressed.

    :param data: {entry id: {"features": array, "intervals": array}}.
    :param metadata: metadata mapping, stored JSON-encoded per key.
    :param rootName: name of the root hdf5 group.
    :param destination: output file path.
    :param compression: hdf5 compression filter name, or None.
    :param compression_opts: options for the chosen compression filter.
    :param full_chunk_shape: unused; kept for interface compatibility.
    """
    log.status("Writing the <%s> computational sequence data to %s" % (rootName, destination))
    if compression is not None:
        log.advise("Compression with %s and opts -%d" % (compression, compression_opts))
    #opening the file
    writeh5Handle = h5py.File(destination, 'w')
    #creating the root handle
    rootHandle = writeh5Handle.create_group(rootName)
    #writing the data
    dataHandle = rootHandle.create_group("data")
    pbar = log.progress_bar(total=len(data.keys()), unit=" Computational Sequence Entries", leave=False)
    for vid in data:
        vidHandle = dataHandle.create_group(vid)
        if compression is not None:
            vidHandle.create_dataset("features", data=data[vid]["features"], compression=compression, compression_opts=compression_opts)
            vidHandle.create_dataset("intervals", data=data[vid]["intervals"], compression=compression, compression_opts=compression_opts)
        else:
            vidHandle.create_dataset("features", data=data[vid]["features"])
            vidHandle.create_dataset("intervals", data=data[vid]["intervals"])
        pbar.update(1)
    pbar.close()
    log.success("<%s> computational sequence data successfully wrote to %s" % (rootName, destination))
    log.status("Writing the <%s> computational sequence metadata to %s" % (rootName, destination))
    #writing the metadata
    metadataHandle = rootHandle.create_group("metadata")
    # fix: `sys.version_info.major is 2` relied on CPython small-int identity;
    # use == for a value comparison. The Python-2/3 string type is also now
    # selected once instead of being re-evaluated per key.
    cast_operator = unicode if sys.version_info.major == 2 else str
    for metadataKey in metadata.keys():
        metadataHandle.create_dataset(metadataKey, (1,), dtype=h5py.special_dtype(vlen=cast_operator))
        metadataHandle[metadataKey][0] = cast_operator(json.dumps(metadata[metadataKey]))
    writeh5Handle.close()
    log.success("<%s> computational sequence metadata successfully wrote to %s" % (rootName, destination))
    log.success("<%s> computational sequence successfully wrote to %s ..." % (rootName, destination))
import mmsdk
from mmsdk import mmdatasdk
from mmsdk.mmdatasdk import log
import ChallengeHML20
import numpy


def download_data(keys):
    """Download the CMU-MOSEI challenge data for each requested key into a
    cmumosei_<key>/ folder.

    :param keys: iterable of challenge data keys (e.g. "raw", "highlevel").
    :return: {key: mmdataset} for the downloaded sequences.
    """
    print("You only need to download the data once!")
    cmumosei_challenge_acl20 = {}
    for key in keys:
        cmumosei_challenge_acl20[key] = mmdatasdk.mmdataset(
            ChallengeHML20.challenge20_data[key], 'cmumosei_%s/' % key)
    # fix: removed a bare no-op expression statement
    # (`cmumosei_challenge_acl20` on its own line) before the return
    return cmumosei_challenge_acl20


if __name__ == "__main__":
    #download_data(["raw"])
    #to download everything, uncomment the following. The highlevel and raw features should be enough for the challenge.
    cmumosei_challenge_20 = download_data(
        list(ChallengeHML20.challenge20_data.keys()))
    log.success("Dataset downloaded")
# Sanity-check script: verify the CMU Multimodal SDK and the challenge
# helper module are importable before running anything else.
try:
    from mmsdk import mmdatasdk
    from mmsdk.mmdatasdk import log
    log.success("CMU Multimodal SDK found!")
# fix: narrowed the bare `except:` so only import failures trigger the hint
# (a bare except also swallowed KeyboardInterrupt/SystemExit)
except ImportError:
    print(
        "SDK Not Found, Check Your PYTHONPATH? Did you install SDK correctly?")
    exit(-1)

try:
    import ChallengeHML20
    log.success("Challenge Helpers Found!")
except ImportError:
    print("ACL Challenge Helpers Not Found, Check Your PYTHONPATH?")
    exit(-2)

log.success(
    "Welcome to ACL20 Challenge-HML CMU-MOSEI Subchallenge. SDK Works!")
def align(self, reference, replace=True):
    """Align every computational sequence in this dataset to `reference` by
    brute-force pairwise interval intersection (self.__intersect).

    :param reference: name of the computational sequence whose intervals
        drive the alignment.
    :param replace: True -> replace dataset content in place and return
        None; False -> return a new mmdataset with the aligned output.
    """
    aligned_output = {}
    for sequence_name in self.computational_sequences.keys():
        aligned_output[sequence_name] = {}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence %s does not exist in dataset" % reference, error=True)
    refseq = self.computational_sequences[reference].data
    #this for loop is for entry_key - for example video id or the identifier of the data entries
    log.status("Alignment based on %s computational sequence started ..." % reference)
    pbar = tqdm(total=len(refseq.keys()), unit=" Computational Sequence Entries")
    pbar.set_description("Overall Progress")
    for entry_key in list(refseq.keys()):
        pbar_small = tqdm(total=refseq[entry_key]['intervals'].shape[0])
        pbar_small.set_description("Aligning %s" % entry_key)
        #intervals for the reference sequence
        for i in range(refseq[entry_key]['intervals'].shape[0]):
            ref_time = refseq[entry_key]['intervals'][i, :]
            # drop zero or near-zero length reference segments
            if (abs(ref_time[0] - ref_time[1]) < epsilon):
                pbar_small.update(1)
                continue
            #aligning all sequences (including ref sequence) to ref sequence
            for otherseq_key in list(self.computational_sequences.keys()):
                otherseq = self.computational_sequences[otherseq_key].data[entry_key]
                #list to contain intersection for (otherseq_key,i)
                list_intervals = []
                list_features = []
                #checking all intervals of the otherseq for intersection
                for j in range(otherseq["intervals"].shape[0]):
                    sub_time = otherseq["intervals"][j]
                    this_features = otherseq["features"][j, :]
                    intersect, intersect_start, intersect_end = self.__intersect(ref_time, sub_time)
                    if intersect == True:
                        list_intervals.append([intersect_start, intersect_end])
                        list_features.append(this_features)
                aligned_output[otherseq_key][entry_key + "[%d]" % i] = {}
                aligned_output[otherseq_key][entry_key + "[%d]" % i]["intervals"] = numpy.array(list_intervals, dtype='float32')
                aligned_output[otherseq_key][entry_key + "[%d]" % i]["features"] = numpy.array(list_features, dtype='float32')
                # fix: replaced a leftover debug block (profanity print of the
                # offending shapes followed by time.sleep(10)) with a proper
                # non-fatal log message
                if (len(aligned_output[otherseq_key][entry_key + "[%d]" % i]["intervals"].shape) != 2):
                    log.error("Aligned intervals for <%s> segment %d of <%s> are not 2-dimensional ..." % (otherseq_key, i, entry_key), error=False)
            pbar_small.update(1)
        pbar_small.visible = False
        pbar_small.close()
        pbar.update(1)
    pbar.visible = False
    pbar.close()
    log.success("Alignment to %s done." % reference)
    if replace is True:
        log.status("Replacing dataset content with aligned computational sequences")
        self.__set_computational_sequences(aligned_output)
        return None
    else:
        log.status("Creating new dataset with aligned computational sequences")
        newdataset = mmdataset({})
        newdataset.__set_computational_sequences(aligned_output)
        return newdataset
# fix: removed a stray module-level `print()` that followed the function
#writing the final aligned to disk deploy(cmumosei_challenge_acl20["highlevel"], "final_aligned") #reading from the disk - if the above process is done. #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("final_aligned") #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds. tensors = cmumosei_challenge_acl20["highlevel"].get_tensors( seq_len=50, non_sequences=["Emotion Labels"], direction=False, folds=[ mmdatasdk.cmu_mosei.standard_folds.standard_train_fold, mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold, mmdatasdk.cmu_mosei.standard_folds.standard_test_fold ]) fold_names = ["train", "valid", "test"] for i in range(3): #output the shape of the tensors for csd in list(cmumosei_challenge_acl20["highlevel"].keys()): print("Shape of the %s computational sequence for %s fold is %s" % (csd, fold_names[i], tensors[i][csd].shape)) if __name__ == "__main__": cmumosei_challenge_20 = process_data() log.success("Dataset processed")