def align(self,reference,collapse_functions=None,replace=True):
		aligned_output={}

		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#unifying the dataset, removing any entries that are not in the reference computational sequence
		self.unify()

		#building the relevant entries for the reference - this section simply strips the [] segment suffixes from the entry ids and populates the entries into a new dictionary
		log.status("Pre-alignment based on <%s> computational sequence started ..."%reference)
		relevant_entries=self.__get_relevant_entries(reference)
		log.status("Alignment starting ...")

		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
			pbar_small.set_description("Aligning %s"%entry_key)
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				#interval for the reference sequence
				ref_time=refseq[entry_key]['intervals'][i,:]
				#we drop zero or very small sequence lengths - no align for those
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue

				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					if otherseq_key != reference:
						intersects,intersects_features=self.__intersect_and_copy(ref_time,relevant_entries[otherseq_key][entry_key],epsilon)
					else:
						intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
					#there were no intersections between reference and subject computational sequences for the entry
					if intersects.shape[0] == 0:
						continue
					#collapsing according to the provided functions
					if type(collapse_functions) is list:
						intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
					if(intersects.shape[0]!=intersects_features.shape[0]):
						log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference),error=True)
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
				pbar_small.update(1)
			pbar_small.close()
			pbar.update(1)
		pbar.close()
		log.success("Alignment to <%s> complete."%reference)
		if replace is True:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output,metadata_copy=False)
			return newdataset
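A minimal usage sketch of align, assuming a dataset whose computational sequences include a hypothetical "words" sequence; the collapse function shown (numpy.mean) is only an illustration:

import numpy

#"words" is a placeholder sequence name; collapse_functions summarizes the
#intersected features of every other sequence over each reference interval
dataset.align("words", collapse_functions=[numpy.mean])
#or keep the dataset untouched and get the aligned copy back instead
#aligned = dataset.align("words", collapse_functions=[numpy.mean], replace=False)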
	def hard_unify(self,active=True):
		log.status("Hard unify was called ...")

		
		all_keys={}
		for seq_key in list(self.computational_sequences.keys()):
			all_keys[seq_key]=[vidid for vidid in self.computational_sequences[seq_key].data.keys()]
		
		valids=set.intersection(*[set(all_keys[x]) for x in all_keys])
		for seq_key in list(self.computational_sequences.keys()):
			hard_unify_compatible=all(["[" in vidid for vidid in self.computational_sequences[seq_key].data.keys()])
			if hard_unify_compatible is False:
				log.error("Hard unify can only be done on aligned computational sequences, %s violated this ... Exiting ..."%seq_key)
			violators=set([vidid for vidid in self.computational_sequences[seq_key].data.keys()])-valids
			for violator in violators:
				if active==True:
					log.error("%s entry is not shared among all sequences, removing it ..."%violator,error=False)
					self[seq_key]._remove_id(violator,purge=False)

			if active==False and len(violators)>0:
				log.error("%d violators remain, alignment will fail if called ..."%len(violators),error=True)
		
		log.success("Hard unify completed ...")
def validateMetadataIntegrity(metadata, rootName, which=True):
    log.status(
        "Checking the integrity of the metadata in <%s> computational sequence ..."
        % rootName)
    failure = False
    if type(metadata) is not dict:
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!" %
            rootName,
            error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if all(presenceFlag) is False:
        #which one is not set
        if which:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (rootName, str(missings)),
                      error=False)
        failure = True
        #if failed before
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ..."
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" %
                    rootName)
    return True
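A quick sketch of a metadata dictionary that passes this check, filled directly from featuresetMetadataTemplate so the required keys need not be guessed:

#sketch only: every required key gets a placeholder value
toy_metadata = {key: "unknown" for key in featuresetMetadataTemplate}
toy_metadata["root name"] = "toy_sequence"
validateMetadataIntegrity(toy_metadata, "toy_sequence")  #logs success, returns True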
Example #4
def readURL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    #TODO: replace the split of destination with cross-os compatible operation
    if os.path.isdir(destination.rsplit('/', 1)[-2]) is False:
        os.mkdir(destination.rsplit('/', 1)[-2])
    # if(os.path.isfile(destination)):
    # 	log.error("%s file already exists ..."%destination,error=True)

    if os.path.isfile(destination):
        log.success("File already downloaded, use the old file")
    else:
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            log.error('URL: %s does not exist' % url, error=True)
        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        wrote = 0
        with open(destination, 'wb') as f:
            log.status("Downloading from %s to %s..." % (url, destination))
            for data in tqdm(r.iter_content(block_size),
                             total=math.ceil(total_size / block_size),
                             unit='KB',
                             unit_scale=True,
                             leave=False):
                wrote = wrote + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            log.error("Error downloading the data ...")
        log.success("Download complete!")

    return True
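A usage sketch with a placeholder URL and destination; the parent folder is created if missing and an already-downloaded file is reused:

readURL("http://example.com/features/words.csd", "downloads/words.csd")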
Example #5
    def __init__(self, recipe, destination=None):
        self.computational_sequences = {}

        if type(recipe) is str:
            if os.path.isdir(recipe) is False:
                log.error("Dataset folder does not exist ...", error=True)

            from os import listdir
            from os.path import isfile, join
            computational_sequence_list = [
                f for f in listdir(recipe)
                if isfile(join(recipe, f)) and f[-4:] == '.csd'
            ]
            for computational_sequence_fname in computational_sequence_list:
                this_sequence = computational_sequence(
                    join(recipe, computational_sequence_fname))
                self.computational_sequences[
                    this_sequence.metadata["root name"]] = this_sequence

        if type(recipe) is dict:
            for entry, address in recipe.items():
                self.computational_sequences[entry] = computational_sequence(
                    address, destination)

        if len(self.computational_sequences.keys()) == 0:
            log.error("Dataset failed to initialize ...", error=True)

        log.success("Dataset initialized successfully ... ")
Example #6
 def impute(self, ref_key, imputation_fn=numpy.zeros):
     log.status("Imputation called ...")
     other_keys = list(self.keys())
     other_keys.remove(ref_key)
     other_keys_dims = {
         x: list(self[x][list(self[x].keys())[0]]["features"].shape[1:])
         for x in other_keys
     }
     pbar = tqdm(total=len(self[ref_key].keys()),
                 unit=" Reference Computational Sequence Entries",
                 leave=False)
     pbar.set_description("Imputation Progress")
     for seg_key in self[ref_key].keys():
         for other_key in other_keys:
             try:
                 self[other_key][seg_key]
             except KeyError:  #segment missing from this sequence
                 self[other_key][seg_key] = {
                     "intervals": self[ref_key][seg_key]["intervals"],
                     "features":
                     imputation_fn([1] + other_keys_dims[other_key])
                 }
         pbar.update(1)
     pbar.close()
     log.success("Imputation completed ...")
Example #7
def initBlank(resource):
    data = {}
    metadata = {}
    metadata["root name"] = resource
    log.success("Initialized empty <%s> computational sequence." %
                metadata["root name"])
    return None, data, metadata
def validate_metadata_format(metadata, root_name, verbose=True):
    log.status(
        "Checking the format of the metadata in <%s> computational sequence ..."
        % root_name)
    failure = False
    if type(metadata) is not dict:
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!" %
            root_name,
            error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if all(presenceFlag) is False:
        #verbose one is not set
        if verbose:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (root_name, str(missings)),
                      error=False)
        failure = True
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ... continuing "
            % root_name,
            error=False)
        return False
    else:
        log.success("<%s> computational sequence metadata in correct format." %
                    root_name)
        return True
Example #9
    def __unify_dataset(self, active=True):
        log.status("Unify was called ...")
        all_vidids = {}
        violators = []
        for seq_key in list(self.computational_sequences.keys()):
            for vidid in list(
                    self.computational_sequences[seq_key].data.keys()):
                vidid = vidid.split('[')[0]
                all_vidids[vidid] = True

        for vidid in list(all_vidids.keys()):
            for seq_key in list(self.computational_sequences.keys()):
                if not any([
                        vidid_in_seq for vidid_in_seq in
                        self.computational_sequences[seq_key].data.keys()
                        if vidid_in_seq[:len(vidid)] == vidid
                ]):
                    violators.append(vidid)
        if len(violators) > 0:
            for violator in violators:
                log.error(
                    "%s entry is not shared among all sequences, removing it ..."
                    % violator,
                    error=False)
                if active == True:
                    self.__remove_id(violator)
        if active == False and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify finished, dataset is compatible for alignment ...")
Example #10
    def revert(self, replace=True):
        reverted_dataset = {x: {} for x in self.keys()}
        log.status("Revert was called ...")
        if len(self.keys()) == 0:
            log.error(
                "The dataset contains no computational sequences ... Exiting!",
                error=True)
        self.unify()
        all_keys = self[list(self.keys())[0]].keys()
        if len(all_keys) == 0:
            log.error(
                "No entries in computational sequences or unify found no shared entries ... Exiting!",
                error=True)

        unique_unnumbered_entries = {}

        for key in all_keys:
            if key.split('[')[0] not in unique_unnumbered_entries:
                unique_unnumbered_entries[key.split('[')[0]] = []
            unique_unnumbered_entries[key.split('[')[0]].append(
                int(key.split('[')[1][:-1]))

        pbar = tqdm(total=len(unique_unnumbered_entries.keys()),
                    unit=" Unique Sequence Entries",
                    leave=False)
        pbar.set_description("Reversion Progress")
        for key in unique_unnumbered_entries.keys():
            unique_unnumbered_entries[key].sort()
            for cs_key in reverted_dataset.keys():
                intervals = numpy.concatenate([
                    self[cs_key][str('%s[%d]' % (key, i))]["intervals"]
                    for i in unique_unnumbered_entries[key]
                ],
                                              axis=0)
                features = numpy.concatenate([
                    self[cs_key][str('%s[%d]' % (key, i))]["features"]
                    for i in unique_unnumbered_entries[key]
                ],
                                             axis=0)
                reverted_dataset[cs_key][key] = {
                    "intervals": intervals,
                    "features": features
                }
            pbar.update(1)
        pbar.close()
        log.success("Reversion completed ...")
        if replace is True:
            log.status(
                "Replacing dataset content with reverted computational sequences"
            )
            self.__set_computational_sequences(reverted_dataset)
            return None
        else:
            log.status(
                "Creating new dataset with reverted computational sequences")
            newdataset = mmdataset({})
            newdataset.__set_computational_sequences(reverted_dataset,
                                                     metadata_copy=False)
            return newdataset
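A sketch of the round trip, under the assumption that the dataset was aligned first so entry ids have the video_id[i] form that revert concatenates back:

dataset.align("words")  #hypothetical reference; yields entries like "vid[0]", "vid[1]", ...
dataset.revert()        #concatenates the numbered segments back into one entry per "vid"
#restored = dataset.revert(replace=False)  #alternative: return a new mmdataset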
Example #11
	def __initialize_blank(self,root_name):
		self.main_file=None
		self.h5handle=None
		self.root_name=root_name
		self.data={}
		self.metadata={}
		self.metadata["root name"]=root_name
		log.success("Initialized empty <%s> computational sequence."%self.metadata["root name"])
Example #12
	def align(self,reference,collapse_functions=None,replace=True):
		aligned_output={}

		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#this for loop is for entry_key - for example video id or the identifier of the data entries
		log.status("Alignment based on <%s> computational sequence started ..."%reference)
		self.__unify_dataset()

		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
			pbar_small.set_description("Aligning %s"%entry_key)
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				#interval for the reference sequence
				ref_time=refseq[entry_key]['intervals'][i,:]
				#we drop zero or very small sequence lengths - no align for those
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue

				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					if entry_key.split('[')[0] not in self.computational_sequences[otherseq_key]._get_entries_stripped():
						log.error("The dataset does not have unified entry ids across computational sequences. Please call intersect first ...")
					if otherseq_key != reference:
						intersects,intersects_features=self.__intersect_and_copy(entry_key,ref_time,self.computational_sequences[otherseq_key],epsilon)
					else:
						intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
					#there were no intersections between reference and subject computational sequences for the entry
					if intersects.shape[0] == 0:
						continue
					#collapsing according to the provided functions
					if type(collapse_functions) is list:
						intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
					if(intersects.shape[0]!=intersects_features.shape[0]):
						log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference))
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
				pbar_small.update(1)
			pbar_small.close()
			pbar.update(1)
		pbar.close()
		log.success("Alignment to <%s> complete."%reference)
		if replace is True:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output)
			return newdataset
	def _checkIntegrity(self,error=True):
		if not hasattr(self,'metadata') or not hasattr(self,'data'):
			log.error("computational sequence is blank (data or metadata is missing)")
		log.status("Checking the integrity of the <%s> computational sequence ..."%self.metadata["root name"])
		#TODO: hash check not implemented yet
		datavalid=validateDataIntegrity(self.data,self.metadata["root name"],which=False)
		metadatavalid=validateMetadataIntegrity(self.metadata,self.metadata["root name"],which=False)
		if datavalid and metadatavalid:
			log.success("<%s> computational sequence is valid!"%self.metadata["root name"])
Example #14
def readCSD(resource,destination=None):

	if resource is None: log.error("No resource specified for computational sequence!",error=True)
	try:
		h5handle=h5py.File('%s'%resource,'r')
	except:
		log.error("%s resource is not a valid hdf5 computational sequence ..."%resource,error=True)
	log.success("Computational sequence read from file %s ..."%resource)
	return h5handle,dict(h5handle[list(h5handle.keys())[0]]["data"]),metadataToDict(h5handle[list(h5handle.keys())[0]]["metadata"])
Example #15
def validateDataIntegrity(data, rootName, which=True):
    log.status(
        "Checking the integrity of the data in <%s> computational sequence ..."
        % rootName)
    failure = False
    if (type(data) is not dict):
        #this will cause the rest of the pipeline to crash - RuntimeError
        log.error(
            "<%s> computational sequence data is not in hierarchy format ..." %
            rootName,
            error=True)
    try:
        #for each video check the shapes of the intervals and features
        for vid in data.keys():
            #check the intervals first - if failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong intervals array shape."
                        % (vid, rootName),
                        error=False)
                failure = True
            #check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong features array shape."
                        % (vid, rootName),
                        error=False)
                failure = True
            #if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[
                    0]:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. "
                        % (vid, rootName),
                        error=False)
                failure = True
    #some other thing has happened! - RuntimeError
    except:
        if which:
            log.error(
                "<%s> computational sequence data integrity could not be checked."
                % rootName,
                error=True)

    #failure during intervals and features check
    if failure:
        log.error(
            "<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. "
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence data in correct format." %
                    rootName)
        return True
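A toy data dictionary that satisfies the checks above: intervals of shape (n, 2) and features of shape (n, d) sharing the same first dimension:

import numpy

toy_data = {"video1": {"intervals": numpy.zeros((10, 2)),
                       "features": numpy.zeros((10, 300))}}
validateDataIntegrity(toy_data, "toy_sequence")  #logs success, returns True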
Example #16
def read_CSD(resource,destination=None):

	if resource is None: log.error("No resource specified for computational sequence!",error=True)
	if os.path.isfile(resource) is False:
		log.error("%s file not found, please check the path ..."%resource,error=True)	
	try:
		h5handle=h5py.File('%s'%resource,'r')
	except:
		log.error("%s resource is not a valid hdf5 computational sequence format ..."%resource,error=True)
	log.success("Computational sequence read from file %s ..."%resource)
	return h5handle,dict(h5handle[list(h5handle.keys())[0]]["data"]),metadata_to_dict(h5handle[list(h5handle.keys())[0]]["metadata"])
Example #17
def writeCSD(data,metadata,rootName,destination):
	#check the data to make sure it is in correct format
	validateDataIntegrity(data,rootName)
	validateMetadataIntegrity(metadata,rootName)

	log.status("Writing the <%s> computational sequence data to %s"%(rootName,destination))	
	#opening the file
	writeh5Handle=h5py.File(destination,'w')
	#creating the root handle
	rootHandle=writeh5Handle.create_group(rootName)

	#writing the data
	dataHandle=rootHandle.create_group("data")
	pbar = tqdm(total=len(data.keys()),unit=" Computational Sequence Entries",leave=False)
	for vid in data:
		vidHandle=dataHandle.create_group(vid)
		vidHandle.create_dataset("features",data=data[vid]["features"])
		vidHandle.create_dataset("intervals",data=data[vid]["intervals"])
		pbar.update(1)
	pbar.close()
	log.success("<%s> computational sequence data successfully wrote to %s"%(rootName,destination))
	log.status("Writing the <%s> computational sequence metadata to %s"%(rootName,destination))
	#writing the metadata
	metadataHandle=rootHandle.create_group("metadata")
	for metadataKey in metadata.keys():
		metadataHandle.create_dataset(metadataKey,(1,),dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
		cast_operator=unicode if sys.version_info.major == 2 else str
		metadataHandle[metadataKey][0]=cast_operator(metadata[metadataKey])

	writeh5Handle.close()
	log.success("<%s> computational sequence metadata successfully wrote to %s"%(rootName,destination))
	log.success("<%s> computational sequence successfully wrote to %s ..."%(rootName,destination))
def readURL(url,destination):
	if destination is None:
		log.error("Destination is not specified when downloading data",error=True)
	if(os.path.isfile(destination)):
		log.error("%s file already exists ..."%destination,error=True)
	r = requests.get(url, stream=True)
	if r.status_code != 200:
		log.error('URL: %s does not exist'%url,error=True) 
	# Total size in bytes.
	total_size = int(r.headers.get('content-length', 0))
	block_size = 1024
	wrote = 0
	with open(destination, 'wb') as f:
		log.status("Downloading from %s to %s..."%(url,destination))
		for data in tqdm(r.iter_content(block_size), total=math.ceil(total_size/block_size), unit='KB', unit_scale=True):
			wrote = wrote + len(data)
			f.write(data)
	if total_size != 0 and wrote != total_size:
		log.error("Error downloading the data ...")
	log.success("Download complete!")
	return True
def read_URL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)

    if os.path.isdir(destination.rsplit(os.sep, 1)[-2]) is False:
        os.mkdir(destination.rsplit(os.sep, 1)[-2])

    if (os.path.isfile(destination)):
        log.error("%s file already exists ..." % destination, error=True)

    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        pbar = log.progress_bar(total=math.ceil(total_size // block_size),
                                data=r.iter_content(block_size),
                                postfix="Total in kBs",
                                unit='kB',
                                leave=False)
        for data in pbar:
            wrote = wrote + len(data)
            f.write(data)
    pbar.close()

    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data to %s ..." % destination,
                  error=True)

    log.success("Download complete!")
    return True
Example #20
    def unify(self, active=True):
        log.status("Unify was called ...")


        all_keys = {}
        for seq_key in list(self.computational_sequences.keys()):
            all_keys[seq_key] = [
                vidid.split("[")[0]
                for vidid in self.computational_sequences[seq_key].data.keys()
            ]

        valids = set.intersection(*[set(all_keys[x]) for x in all_keys])
        violators = set()
        for seq_key in list(self.computational_sequences.keys()):
            violators = violators.union(
                set([
                    vidid.split("[")[0] for vidid in
                    self.computational_sequences[seq_key].data.keys()
                ]) - valids)

        if len(violators) > 0:
            for violator in violators:
                log.error(
                    "%s entry is not shared among all sequences, removing it ..."
                    % violator,
                    error=False)
                if active == True:
                    self.remove_id(violator, purge=True)
        if active == False and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify completed ...")
Example #21
def write_CSD(data,metadata,rootName,destination,compression,compression_opts,full_chunk_shape):

	log.status("Writing the <%s> computational sequence data to %s"%(rootName,destination))
	if compression is not None:
		log.advise("Compression with %s and opts -%d"%(compression,compression_opts))
	#opening the file
	writeh5Handle=h5py.File(destination,'w')
	#creating the root handle
	rootHandle=writeh5Handle.create_group(rootName)

	#writing the data
	dataHandle=rootHandle.create_group("data")
	pbar = log.progress_bar(total=len(data.keys()),unit=" Computational Sequence Entries",leave=False)
	for vid in data:
		vidHandle=dataHandle.create_group(vid)
		if compression is not None:
			vidHandle.create_dataset("features",data=data[vid]["features"],compression=compression,compression_opts=compression_opts)
			vidHandle.create_dataset("intervals",data=data[vid]["intervals"],compression=compression,compression_opts=compression_opts)
		else:
			vidHandle.create_dataset("features",data=data[vid]["features"])
			vidHandle.create_dataset("intervals",data=data[vid]["intervals"])
			
		pbar.update(1)
	pbar.close()
	log.success("<%s> computational sequence data successfully wrote to %s"%(rootName,destination))
	log.status("Writing the <%s> computational sequence metadata to %s"%(rootName,destination))
	#writing the metadata
	metadataHandle=rootHandle.create_group("metadata")
	for metadataKey in metadata.keys():
		metadataHandle.create_dataset(metadataKey,(1,),dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
		cast_operator=unicode if sys.version_info.major == 2 else str
		metadataHandle[metadataKey][0]=cast_operator(json.dumps(metadata[metadataKey]))
	writeh5Handle.close()

	log.success("<%s> computational sequence metadata successfully wrote to %s"%(rootName,destination))
	log.success("<%s> computational sequence successfully wrote to %s ..."%(rootName,destination))
import mmsdk
from mmsdk import mmdatasdk
from mmsdk.mmdatasdk import log
import ChallengeHML20
import numpy


def download_data(keys):
    print("You only need to download the data once!")
    cmumosei_challenge_acl20 = {}
    for key in keys:
        cmumosei_challenge_acl20[key] = mmdatasdk.mmdataset(
            ChallengeHML20.challenge20_data[key], 'cmumosei_%s/' % key)
    return cmumosei_challenge_acl20


if __name__ == "__main__":
    #to download only the raw features: download_data(["raw"])
    #the following downloads everything; the highlevel and raw features should be enough for the challenge.
    cmumosei_challenge_20 = download_data(
        list(ChallengeHML20.challenge20_data.keys()))
    log.success("Dataset downloaded")
Example #23
try:
    from mmsdk import mmdatasdk
    from mmsdk.mmdatasdk import log
    log.success("CMU Multimodal SDK found!")
except ImportError:
    print(
        "SDK Not Found, Check Your PYTHONPATH? Did you install SDK correctly?")
    exit(-1)

try:
    import ChallengeHML20
    log.success("Challenge Helpers Found!")
except ImportError:
    print("ACL Challenge Helpers Not Found, Check Your PYTHONPATH?")
    exit(-2)

log.success(
    "Welcome to ACL20 Challenge-HML CMU-MOSEI Subchallenge. SDK Works!")
Example #24
	def align(self,reference,replace=True):
		aligned_output={}
		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence %s does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#this for loop is for entry_key - for example video id or the identifier of the data entries
		log.status("Alignment based on %s computational sequence started ..."%reference)
		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries")
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0])
			pbar_small.set_description("Aligning %s"%entry_key)
			#intervals for the reference sequence
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				ref_time=refseq[entry_key]['intervals'][i,:]
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue
				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					otherseq=self.computational_sequences[otherseq_key].data[entry_key]
					#list to contain intersection for (otherseq_key,i)
					list_intervals=[]
					list_features=[]
					#checking all intervals of the otherseq for intersection
					for j in range(otherseq["intervals"].shape[0]):
						sub_time=otherseq["intervals"][j]
						this_features=otherseq["features"][j,:]
						intersect,intersect_start,intersect_end=self.__intersect(ref_time,sub_time)
						if intersect == True:
							list_intervals.append([intersect_start,intersect_end])
							list_features.append(this_features)
					
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=numpy.array(list_intervals,dtype='float32')
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=numpy.array(list_features,dtype='float32')
					if (len(aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"].shape)!=2):
						log.error("Malformed alignment for segment %s[%d]: intervals shape %s, features shape %s ..."%(entry_key,i,str(aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"].shape),str(aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"].shape)),error=False)
				pbar_small.update(1)
			pbar_small.close()
			pbar.update(1)
		pbar.close()
		log.success("Alignment to %s done."%reference)
		if replace is True:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output)
			return newdataset
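The __intersect helper is not shown in this example; a minimal sketch of an interval overlap test consistent with how it is called above (an assumption, not the SDK's actual implementation):

	def __intersect(self, ref_time, sub_time):
		#overlap window of [ref_start, ref_end] and [sub_start, sub_end]
		intersect_start = max(ref_time[0], sub_time[0])
		intersect_end = min(ref_time[1], sub_time[1])
		#the intervals intersect only if the overlap is longer than epsilon
		return (intersect_end - intersect_start > epsilon), intersect_start, intersect_end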
    #writing the final aligned to disk
    deploy(cmumosei_challenge_acl20["highlevel"], "final_aligned")

    #reading from the disk - if the above process is done.
    #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("final_aligned")

    #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds.
    tensors = cmumosei_challenge_acl20["highlevel"].get_tensors(
        seq_len=50,
        non_sequences=["Emotion Labels"],
        direction=False,
        folds=[
            mmdatasdk.cmu_mosei.standard_folds.standard_train_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_test_fold
        ])

    fold_names = ["train", "valid", "test"]

    for i in range(3):
        #output the shape of the tensors
        for csd in list(cmumosei_challenge_acl20["highlevel"].keys()):
            print("Shape of the %s computational sequence for %s fold is %s" %
                  (csd, fold_names[i], tensors[i][csd].shape))


if __name__ == "__main__":
    cmumosei_challenge_20 = process_data()
    log.success("Dataset processed")