Пример #1
0
    def __unify_dataset(self, active=True):
        """Ensure every entry id is present in all computational sequences.

        Collects the base entry ids (the part before any ``[...]`` suffix)
        across all sequences, then flags ids that are missing from at least
        one sequence.

        Args:
            active: if True, violating entries are removed from the dataset;
                if False, violations are only reported and a fatal error is
                raised if any remain (alignment would fail otherwise).
        """
        log.status("Unify was called ...")

        # Gather the set of base entry ids seen in any sequence.
        all_vidids = {}
        for seq_key in list(self.computational_sequences.keys()):
            for vidid in list(
                    self.computational_sequences[seq_key].data.keys()):
                all_vidids[vidid.split('[')[0]] = True

        # An id is a violator if some sequence has no entry starting with it.
        # Bug fix: previously an id missing from several sequences was
        # appended once per sequence, producing duplicate removals and
        # duplicate error logs; a set plus early break records it once.
        violators = set()
        for vidid in all_vidids:
            for seq_key in self.computational_sequences:
                if not any(
                        vidid_in_seq[:len(vidid)] == vidid
                        for vidid_in_seq in
                        self.computational_sequences[seq_key].data.keys()):
                    violators.add(vidid)
                    break

        for violator in violators:
            log.error(
                "%s entry is not shared among all sequences, removing it ..."
                % violator,
                error=False)
            if active:
                self.__remove_id(violator)
        if not active and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify finished, dataset is compatible for alignment ...")
Пример #2
0
    def __set_computational_sequences(self,
                                      new_computational_sequences_data,
                                      metadata_copy=True):
        """Replace this dataset's sequences with freshly built ones.

        Args:
            new_computational_sequences_data: mapping from sequence name to
                the data dictionary to load into a new computational_sequence.
            metadata_copy: when True, carry over each sequence's previous
                metadata (if any) into the rebuilt sequence.
        """
        # getting the old metadata from the sequence before replacing it.
        # Even if this is a new computational sequence this will not cause an
        # issue since old_metadata will just be empty
        old_metadata = {
            m: self.computational_sequences[m].metadata
            for m in list(self.computational_sequences.keys())
        }
        self.computational_sequences = {}
        for sequence_name in list(new_computational_sequences_data.keys()):
            self.computational_sequences[
                sequence_name] = computational_sequence(sequence_name)
            self.computational_sequences[sequence_name].setData(
                new_computational_sequences_data[sequence_name], sequence_name)
            if metadata_copy:
                # if there is no metadata for this computational sequence from
                # the previous one, or no previous computational sequence
                if sequence_name not in old_metadata:
                    # Bug fix: previously the code fell through to
                    # old_metadata[sequence_name] after this warning and
                    # raised KeyError; now metadata is only copied when it
                    # actually exists.
                    log.error(
                        "Metadata not available to copy ..., please provide metadata before writing to disk later",
                        error=False)
                else:
                    self.computational_sequences[sequence_name].setMetadata(
                        old_metadata[sequence_name], sequence_name)
            self.computational_sequences[
                sequence_name].rootName = sequence_name
Пример #3
0
    def __init__(self, recipe, destination=None):
        """Build a dataset from a recipe.

        Args:
            recipe: either a path to a folder of ``.csd`` files (str) or a
                mapping from sequence name to resource address (dict).
            destination: forwarded to computational_sequence for dict recipes.
        """
        self.computational_sequences = {}

        if type(recipe) is str:
            if os.path.isdir(recipe) is False:
                log.error("Dataset folder does not exist ...", error=True)

            # every .csd file directly inside the folder becomes one sequence
            for fname in os.listdir(recipe):
                fullpath = os.path.join(recipe, fname)
                if not os.path.isfile(fullpath) or not fname.endswith('.csd'):
                    continue
                seq = computational_sequence(fullpath)
                self.computational_sequences[seq.metadata["root name"]] = seq

        if type(recipe) is dict:
            for entry, address in recipe.items():
                self.computational_sequences[entry] = computational_sequence(
                    address, destination)

        if len(self.computational_sequences.keys()) == 0:
            log.error("Dataset failed to initialize ...", error=True)

        log.success("Dataset initialized successfully ... ")
Пример #4
0
 def add_computational_sequences(self, recipe, destination):
     """Register each (name, address) pair in *recipe* as a computational
     sequence of this dataset."""
     for entry in recipe:
         address = recipe[entry]
         if entry in self.computational_sequences:
             # NOTE(review): whether this aborts or merely warns depends on
             # log.error's default for its `error` flag — confirm; if it does
             # not raise, the existing sequence is silently replaced below.
             log.error(
                 "Dataset already contains <%s> computational sequence ..."
                 % entry)
         self.computational_sequences[entry] = computational_sequence(
             address, destination)
Пример #5
0
	def _checkIntegrity(self,error=True):
		"""Validate that this sequence has well-formed data and metadata.

		Args:
			error: forwarded to log.error when the sequence is blank, so a
				missing data/metadata pair can either raise or just warn.
		"""
		if not hasattr(self,'metadata') or not hasattr(self,'data'):
			# Bug fix: the `error` argument was previously ignored; also stop
			# here, since the status line below would crash on missing metadata.
			log.error("computational sequence is blank (data or metadata is missing)",error=error)
			return False
		log.status("Checking the integrity of the <%s> computational sequence ..."%self.metadata["root name"])
		#TODO: hash check not implemented yet
		datavalid=validateDataIntegrity(self.data,self.metadata["root name"],which=False)
		metadatavalid=validateMetadataIntegrity(self.metadata,self.metadata["root name"],which=False)
		if datavalid and metadatavalid:
			log.success("<%s> computational sequence is valid!"%self.metadata["root name"])
Пример #6
0
 def __collapse(self, intervals, features, functions):
     """Collapse a set of intervals/features into a single row.

     The output interval is the (1, 2) span [min, max] over all input
     intervals; the features are reduced with the first function in
     *functions*.
     """
     #we simply collapse the intervals to (1,2) matrix
     new_interval = numpy.array([[intervals.min(), intervals.max()]])
     try:
         # only the first collapse function is applied
         function = functions[0]
         new_features = function(intervals, features)
         # promote a 1-D result to a (1, d) row vector
         if len(new_features.shape) == 1:
             new_features = new_features[None, :]
     except Exception:
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # propagate; log.error(error=True) raises, so new_features is never
         # used uninitialized on this path.
         log.error("Cannot collapse given the set of function.", error=True)
     return new_interval, new_features
Пример #7
0
 def deploy(self, destination, filenames):
     """Write every computational sequence to disk under *destination*.

     Args:
         destination: output directory, created if missing.
         filenames: mapping from sequence key to output filename; the
             ``.csd`` extension is appended when absent.
     """
     if os.path.isdir(destination) is False:
         os.mkdir(destination)
     for seq_key in list(self.computational_sequences.keys()):
         if seq_key not in list(filenames.keys()):
             log.error(
                 "Filename for %s computational sequences not specified" %
                 seq_key)
         filename = filenames[seq_key]
         # Bug fix: the old check `filename[:-4] != '.csd'` inspected the
         # wrong slice, so names already ending in .csd received a second
         # extension (e.g. "x.csd.csd").
         if not filename.endswith('.csd'):
             filename += '.csd'
         self.computational_sequences[seq_key].deploy(
             os.path.join(destination, filename))
Пример #8
0
def readURL(url, destination):
    """Download *url* to the local path *destination* with a progress bar.

    Returns:
        True on completion. Fatal errors (missing destination, existing
        file, unreachable URL) are raised via log.error(error=True).
    """
    # Bug fix: the None check must come before any use of destination
    # (previously destination.rsplit ran first and crashed on None).
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    #TODO: replace the split of destination with cross-os compatible operation
    # Bug fix: guard against a bare filename with no '/' — rsplit would
    # return a single element and [-2] raised IndexError.
    parent = destination.rsplit('/', 1)[-2] if '/' in destination else ''
    if parent and os.path.isdir(parent) is False:
        os.mkdir(parent)
    if (os.path.isfile(destination)):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes (0 when the server sends no content-length).
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        # Bug fix: total was math.ceil(total_size // block_size); integer
        # division already floors, so the final partial block was dropped
        # from the progress total.
        for data in tqdm(r.iter_content(block_size),
                         total=math.ceil(total_size / block_size),
                         unit='KB',
                         unit_scale=True,
                         leave=False):
            wrote = wrote + len(data)
            f.write(data)
    # the with-statement closes the file; the old explicit f.close() was redundant
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data ...")
    log.success("Download complete!")
    return True
Пример #9
0
	def _initialize(self,resource,destination):
		"""Route initialization of this computational sequence.

		Three cases, decided by *resource*:
		  * already initialized (has h5handle) -> fatal error;
		  * no '.csd' in resource -> treat resource as a root name and
		    initialize a blank sequence;
		  * otherwise a URL (downloaded to *destination* first) or a local
		    .csd path, read via readCSD.
		"""
		#computational sequence is already initialized
		if hasattr(self,'h5handle'): raise log.error("<%s> computational sequence already initialized ..."%self.metadata["root name"],error=True)
		#initializing blank - mainFile is where to initialize the data and resource is None since the data comes from nowhere
		if '.csd' not in resource:
			self.mainFile=None
			#self.resource will be None since there is nowhere this was read from - resource being passed to initBlank is the name of root
			self.resource=None
			self.rootName=resource
			return initBlank(self.rootName)
		#reading from url - mainFile is where the data should go and resource is the url
		else:
			if validators.url(resource):
				#user would like to store to the current directory
				if destination is None or destination == '':
					destination=os.path.join('./',resource.split('/')[-1])
				#user has chosen a different directory
				elif '.csd' not in destination:
					destination=os.path.join(destination,resource.split('/')[-1])
				readURL(resource,destination)
				self.mainFile=destination
				self.resource=resource
			else:
				self.mainFile=resource
			return readCSD(self.mainFile)
Пример #10
0
def readCSD(resource, destination=None):
    """Read a computational sequence from a .csd (HDF5) file.

    Args:
        resource: path to the .csd file.
        destination: unused; kept for signature compatibility with callers.

    Returns:
        (h5py handle, data dict, metadata dict) for the file's single
        root group.
    """
    if (resource is None):
        raise log.error("No resource specified for computational sequence!",
                        error=True)
    if os.path.isfile(resource) is False:
        log.error("%s file not found, please check the path ..." % resource,
                  error=True)
    try:
        h5handle = h5py.File(resource, 'r')
    except Exception:
        # Narrowed from a bare `except:`; log.error(error=True) raises, so
        # the outer `raise` never actually runs (kept for the file's style).
        raise log.error(
            "%s resource is not a valid hdf5 computational sequence  ..." %
            resource,
            error=True)
    log.success("Computational sequence read from file %s ..." % resource)
    # a .csd file holds exactly one root group containing "data"/"metadata";
    # compute its name once instead of twice
    rootname = list(h5handle.keys())[0]
    return h5handle, dict(h5handle[rootname]["data"]), metadataToDict(
        h5handle[rootname]["metadata"])
Пример #11
0
def validateMetadataIntegrity(metadata, rootName, which=True):
    """Check that *metadata* is a dict containing every required key.

    Args:
        metadata: candidate metadata mapping.
        rootName: sequence name used in log messages.
        which: when True, list the specific missing keys.

    Returns:
        True when the metadata is complete; on failure log.error(error=True)
        raises before returning.
    """
    log.status(
        "Checking the integrity of the metadata in <%s> computational sequence ..."
        % rootName)
    failure = False
    if type(metadata) is not dict:
        # Bug fix: the %s placeholder previously had no argument bound,
        # so the literal "<%s>" was logged instead of the sequence name.
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!" %
            rootName,
            error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if all(presenceFlag) is False:
        #which one is not set
        if which:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (rootName, str(missings)),
                      error=False)
        failure = True
        #if failed before
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ..."
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" %
                    rootName)
    return True
Пример #12
0
	def bib_citations(self,outfile=None):
		"""Write the featureset and dataset bib citations to *outfile*.

		Args:
			outfile: writable file object; defaults to sys.stdout.
		"""
		outfile=sys.stdout if outfile is None else outfile
		if self.metadata is None or self.metadata=={}:
			log.error("Metadata is not set for <%s> computational sequence"%self.rootName)
			# Bug fix: stop here — the writes below would crash (TypeError /
			# KeyError) on missing metadata if log.error does not raise.
			return
		outfile.write('Computational Sequence <%s> bib: '%self.rootName+self.metadata['featureset bib citation']+'\n\n')
		outfile.write('Dataset <%s> bib: '%self.metadata["dataset name"]+self.metadata['dataset bib citation']+'\n\n')
Пример #13
0
 def __getitem__(self, key):
     """Return the computational sequence stored under *key*; a missing key
     is a fatal error."""
     if key not in self.computational_sequences:
         log.error("Computational sequence does not exist ...", error=True)
     return self.computational_sequences[key]
Пример #14
0
    def align(self, reference, collapse_functions=None, replace=True):
        """Align all computational sequences to the *reference* sequence.

        For every entry and every interval of the reference sequence, the
        overlapping segments of every sequence are extracted (and optionally
        collapsed into a single row).

        Args:
            reference: name of the sequence to align to; must exist in the
                dataset (fatal error otherwise).
            collapse_functions: optional list of reduction functions; applied
                only when a list is passed (see __collapse — only the first
                function is used).
            replace: when True the dataset's own sequences are replaced with
                the aligned output and None is returned; otherwise a new
                mmdataset holding the aligned data is returned.
        """
        aligned_output = {}

        for sequence_name in self.computational_sequences.keys():
            aligned_output[sequence_name] = {}
        if reference not in self.computational_sequences.keys():
            log.error("Computational sequence <%s> does not exist in dataset" %
                      reference,
                      error=True)
        refseq = self.computational_sequences[reference].data
        # unifying the dataset, removing any entries that are not in the reference computational sequence
        self.__unify_dataset()

        # building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
        log.status(
            "Alignment based on <%s> computational sequence will start shortly ..."
            % reference)
        relevant_entries = self.__get_relevant_entries(reference)

        pbar = tqdm(total=len(refseq.keys()),
                    unit=" Computational Sequence Entries",
                    leave=False)
        pbar.set_description("Overall Progress")
        for entry_key in list(refseq.keys()):
            pbar_small = tqdm(total=refseq[entry_key]['intervals'].shape[0],
                              unit=" Segments",
                              leave=False)
            pbar_small.set_description("Aligning %s" % entry_key)
            for i in range(refseq[entry_key]['intervals'].shape[0]):
                #interval for the reference sequence
                ref_time = refseq[entry_key]['intervals'][i, :]
                #we drop zero or very small sequence lengths - no align for those
                if (abs(ref_time[0] - ref_time[1]) < epsilon):
                    pbar_small.update(1)
                    continue

                #aligning all sequences (including ref sequence) to ref sequence
                for otherseq_key in list(self.computational_sequences.keys()):
                    if otherseq_key != reference:
                        intersects, intersects_features = self.__intersect_and_copy(
                            ref_time,
                            relevant_entries[otherseq_key][entry_key], epsilon)
                    else:
                        intersects, intersects_features = refseq[entry_key][
                            'intervals'][i, :][None, :], refseq[entry_key][
                                'features'][i, :][None, :]
                    #there were no intersections between reference and subject computational sequences for the entry
                    if intersects.shape[0] == 0:
                        continue
                    #collapsing according to the provided functions
                    if type(collapse_functions) is list:

                        intersects, intersects_features = self.__collapse(
                            intersects, intersects_features,
                            collapse_functions)
                    #two following lines were commented due to incompatibility with our task
                    #if(intersects.shape[0]!=intersects_features.shape[0]):
                    #	log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference))
                    aligned_output[otherseq_key][entry_key + "[%d]" % i] = {}
                    aligned_output[otherseq_key][entry_key + "[%d]" %
                                                 i]["intervals"] = intersects
                    aligned_output[otherseq_key][
                        entry_key +
                        "[%d]" % i]["features"] = intersects_features
                # Bug fix: advance the per-entry segment bar once per
                # segment; it previously sat outside the segment loop and
                # fired only once per entry, leaving the bar inaccurate.
                pbar_small.update(1)
            pbar_small.close()
            pbar.update(1)
        pbar.close()
        log.success("Alignment to <%s> complete." % reference)
        if replace is True:
            log.status(
                "Replacing dataset content with aligned computational sequences"
            )
            self.__set_computational_sequences(aligned_output)
            return None
        else:
            log.status(
                "Creating new dataset with aligned computational sequences")
            newdataset = mmdataset({})
            newdataset.__set_computational_sequences(aligned_output,
                                                     metadata_copy=False)
            return newdataset
Пример #15
0
def validateDataIntegrity(data, rootName, which=True):
    """Check the structural integrity of a computational sequence's data.

    Every entry must provide 2-D "intervals" and "features" arrays whose
    first dimensions match.

    Args:
        data: mapping from entry id to {"intervals": ..., "features": ...}.
        rootName: sequence name used in log messages.
        which: when True, log which entry failed each individual check.

    Returns:
        True when the data is valid; on failure log.error(error=True) raises.
    """
    log.status(
        "Checking the integrity of the data in <%s> computational sequence ..."
        % rootName)

    pbar = tqdm(total=len(data.keys()),
                unit=" Computational Sequence Entries",
                leave=False)
    failure = False
    if (type(data) is not dict):
        #this will cause the rest of the pipeline to crash - RuntimeError
        # Bug fix: the %s placeholder previously had no argument bound.
        log.error(
            "%s computational sequence data is not in heirarchy format ..." %
            rootName,
            error=True)
    try:
        #for each video check the shapes of the intervals and features
        for vid in data.keys():
            #check the intervals first - if failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in  <%s> computational sequence has wrong intervals array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in  <%s> computational sequence has wrong features array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[
                    0]:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. "
                        % (vid, rootName),
                        error=False)
                failure = True
            pbar.update(1)
    #some other thing has happened! - RuntimeError
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed
        if which:
            log.error(
                "<%s> computational sequence data itegrity could not be checked. "
                % rootName,
                error=True)
        pbar.close()
    pbar.close()

    #failure during intervals and features check
    if failure:
        log.error(
            "<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. "
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence data in correct format." %
                    rootName)
        return True