Exemplo n.º 1
0
    def simsearch_SAX(self, ts):
        """
        Return the primary key of the stored series closest to ``ts``, using
        the SAX tree to pick the candidates.

        The query is interpolated (``interp1d``) onto ``self.tslen_SAX`` points
        spanning its own time range, converted with ``isax_indb`` and looked up
        in ``self.SAX_tree``.  Every key listed on the node found is compared
        against the resampled query with ``self.dist``.

        Parameters
        ----------
        ts : TimeSeries or indexable pair
            Either a TimeSeries, or a sequence whose element 0 holds the times
            and element 1 holds the values.

        Returns
        -------
        The key with the smallest distance among the candidates (the first one
        encountered wins ties), or None when the node lists no keys.
        """
        if isinstance(ts, TimeSeries):
            ts = [ts.time, ts.data]
        times = ts[0]

        # Resample the query onto the fixed-length grid used by the SAX index.
        grid = np.linspace(min(times), max(times), self.tslen_SAX)
        query = TimeSeries(grid, interp1d(times, ts[1])(grid))
        word = isax_indb(query, self.card, self.wordlength)

        # Use the secondary lookup when the first node carries no keys.
        node = self.SAX_tree.search(word)
        if not node.ts:
            node = self.SAX_tree.search2(word)

        best_pk = None
        best_dist = None
        for candidate in node.ts:
            candidate_dist = self.dist(query, self.rows_SAX[candidate]['ts'])
            if best_dist is None or candidate_dist < best_dist:
                best_pk, best_dist = candidate, candidate_dist

        return best_pk
Exemplo n.º 2
0
 def delete_ts(self, pk):
     """
     Delete the entry stored under primary key ``pk``.

     If ``pk`` is in ``self.rows``, the key is removed from every index that
     references it, the row is dropped and a ``pk:DELETE:0`` record is
     appended to the database file.  If ``pk`` is in ``self.rows_SAX``, it is
     also removed from the SAX tree and from ``self.rows_SAX``.  A key found
     in neither table is ignored silently.

     Parameters
     ----------
     pk : str
         Primary key of the entry to delete.
     """
     if pk in self.rows:
         row = self.rows[pk]
         # Walk a snapshot of the field names: self.del_vp (defined outside
         # this block) is called mid-loop and may drop fields from this row,
         # which would invalidate a live dict iterator.
         for field in list(row):
             if field[:5] == 'd_vp-':
                 # 'd_vp-<key>' fields are indexed under their own name; the
                 # field whose suffix equals this row's key is skipped.
                 if field[5:] != pk:
                     self.indexes[field].delete(row[field], pk)
             elif self.schema[field]['index'] is not None:
                 if self.schema[field]['type'] in [int, float]:
                     # Numeric index: exposes delete(value, pk).
                     self.indexes[field].delete(row[field], pk)
                 else:
                     # Other types: mapping from value to a collection of pks.
                     self.indexes[field][row[field]].remove(pk)
             if field == 'vp' and row['vp'] == True:
                 self.del_vp(pk)
         del self.rows[pk]
         # The context manager closes the log even if the write fails.
         with open(self.dbname, 'a') as fd:
             fd.write(pk+':DELETE:0\n')
     if pk in self.rows_SAX:
         rep = isax_indb(self.rows_SAX[pk]['ts'], self.card, self.wordlength)
         self.SAX_tree.delete(rep, pk)
         del self.rows_SAX[pk]
Exemplo n.º 3
0
    def __init__(self, schema, pkfield, load=False, dbname="db", overwrite=False, dist=procs.corr_indb, threshold = 10, wordlength = 16, tslen = 256, cardinality = 64):
        """
        Validate the configuration, build empty indexes and, if requested,
        restore the database contents from disk.

        Parameters
        ----------
        schema : dict
            Key = name of field (e.g. 'ts', 'mean')
            Value = dict of that field's properties.  Recognized keys include:
                'type': Required for all fields except ts.  pkfield must have type str.
                        One of int, float, bool, str.
                'index': Required for all fields.  None means "not indexed".
            The names 'DELETE' and anything starting with 'd_vp-' or containing
            ':' are reserved; a field named 'vp' must be of type bool.
        pkfield : str
            The name of the field which will be the primary key.  Must match a key in schema.
        load : bool
            Whether to populate the database with an existing one on file.
        dbname : str
            Database filename
        overwrite : bool
            If load=False, whether to overwrite an existing database.
        dist : function
            Calculates the distance between two TimeSeries objects, must take arguments (ts1, ts2)
        threshold : int
            Positive integer forwarded to the SAX tree initializer.
        wordlength : int
            Positive power of two; forwarded to the SAX tree initializer and
            used when computing SAX representations.
        tslen : int
            Power of two, at least wordlength.  Number of points every series
            is resampled to before it is added to the SAX tree.
        cardinality : int
            Power of two, at most 64; used when computing SAX representations.

        Attributes
        ----------
        indexes : dict
            Key = fieldname
            Value = binary search tree (if int or float) or dictionary of sets (otherwise) mapping values to pks
        rows : dict
            Key = primary key
            Value = dict of the fields associated with each key
        rows_SAX : dict
            Same layout as rows, but 'ts' holds the resampled series.
        schema : dict (See above)
        pkfield : str (See above)
        dbname : str (See above)
        tslen : int or None
            Length of the stored timeseries; None until one has been loaded.
        tslen_SAX : int
            The ``tslen`` argument (length of the resampled series).
        card, wordlength, threshold : int (See above)
        vps : list
            Primary keys whose 'vp' field was loaded as True.

        Raises
        ------
        ValueError
            If any argument fails validation, or if load=False, the file
            ``dbname`` already exists and overwrite is False.
        IOError
            If load=True and the database cannot be read or parsed.  The
            underlying error is attached as ``__cause__``.
        """
        # ---- Validating input ---- #
        if not isinstance(pkfield, str):
            raise ValueError("Field name must be of type str")
        if not isinstance(threshold, int):
            raise ValueError("Threshold must be of type int")
        if not isinstance(wordlength, int):
            raise ValueError("Word length must be of type int")
        if threshold <= 0:
            raise ValueError("Threshold must be greater than zero")
        if wordlength <= 0:
            raise ValueError("Word length must be greater than zero")
        # A positive integer is a power of two exactly when its binary form
        # has no '1' after the leading digit.
        if '1' in '{0:b}'.format(wordlength)[1:]:
            raise ValueError("Word length must be a power of two")
        if not isinstance(tslen, int):
            raise ValueError("TimeSeries length must be of type int")
        if tslen < wordlength:
            raise ValueError("TimeSeries length must be greater than or equal to the word length")
        if '1' in '{0:b}'.format(tslen)[1:]:
            raise ValueError("TimeSeries length must be a power of two")
        if not isinstance(cardinality, int):
            raise ValueError("Cardinality must be of type int")
        if cardinality <= 0:
            raise ValueError("Cardinality must be greater than zero")
        if '1' in '{0:b}'.format(cardinality)[1:]:
            raise ValueError("Cardinality must be a power of two")
        if cardinality > 64:
            raise ValueError("Cardinalities greater than 64 are not supported")
        if not isinstance(load, bool):
            raise ValueError("Load must be of type bool")
        if not isinstance(dbname, str):
            raise ValueError("Database name must be string")
        if not isinstance(overwrite, bool):
            raise ValueError("Overwrite must be of type bool")
        if not isinstance(schema, dict):
            raise ValueError("Schema must be a dictionary")
        for field in schema:
            if field == 'DELETE':
                raise ValueError("The fieldname 'DELETE' is forbidden")
            if ':' in field:
                raise ValueError("Field names may not contain the ':' character")
            if field != 'ts':
                if 'type' not in schema[field]:
                    raise ValueError("Schema must specify type for each non-ts field")
                if field == pkfield and schema[field]['type'] != str:
                    raise ValueError("Primary key must be of type str")
                if schema[field]['type'] not in [int, float, bool, str]:
                    raise ValueError("Only types int, float, bool, and str are supported")
            if field[:5] == 'd_vp-':
                raise ValueError("Field names beginning with 'd_vp-' are forbidden")
            if field == 'vp' and schema[field]['type'] != bool:
                raise ValueError("Field 'vp' must be of boolean type")
        if pkfield not in schema:
            raise ValueError("Primary key field must be included in schema")

        # Assign attributes according to schema
        self.indexes = {}
        self.rows = {}
        self.rows_SAX = {}
        self.wordlength = wordlength
        self.threshold = threshold
        self.SAX_tree = Tree_Initializer(threshold = threshold, wordlength = wordlength).tree
        self.card = cardinality
        self.schema = schema
        self.dbname = dbname
        self.pkfield = pkfield
        self.tslen = None
        self.tslen_SAX = tslen
        self.overwrite = overwrite
        self.dist = dist
        self.vps = []
        for s in schema:
            indexinfo = schema[s]['index']
            if indexinfo is not None:
                if schema[s]['type'] in (int, float):
                    self.indexes[s] = BinarySearchTree()
                else:  # Add a bitmask option for strings?
                    self.indexes[s] = defaultdict(set)

        if load:
            try:
                # Replay the log.  Each record is "pk:field:value".  The
                # context manager closes the file even when a record is bad.
                with open(dbname) as fd:
                    # readlines() snapshots the file before any record is
                    # processed, exactly as the log existed at open time.
                    for l in fd.readlines():
                        [pk, field, val] = l.strip().split(":")
                        if field in self.schema:
                            # rows and rows_SAX receive the same metadata.  The
                            # first record seen for a key only creates the row
                            # (its value is not stored); later records set fields.
                            for table in (self.rows, self.rows_SAX):
                                if pk not in table:
                                    table[pk] = {pkfield: pk}
                                elif self.schema[field]['type'] == bool:
                                    # Anything other than the text 'False' is True.
                                    table[pk][field] = (val != 'False')
                                else:
                                    table[pk][field] = self.schema[field]['type'](val)
                            if field == 'vp' and val == 'True':
                                self.vps.append(pk)
                                self.indexes['d_vp-'+pk] = BinarySearchTree()
                        elif field == 'DELETE':
                            if 'vp' in schema and self.rows[pk]['vp'] == True:
                                self.del_vp(pk)
                            del self.rows[pk]
                            del self.rows_SAX[pk]
                        elif field[:5] == 'd_vp-':
                            self.rows[pk][field] = float(val)
                        else:
                            # Caught below and reported with the generic
                            # message; this error survives as __cause__.
                            raise IOError("Database is incompatible with input schema")

                # Read in timeseries of non-deleted keys
                for pk in self.rows:
                    tsarray = np.load(self.dbname+"_ts/"+pk+"_ts.npy")
                    self.rows[pk]['ts'] = TimeSeries(tsarray[0,:], tsarray[1,:])
                    self.tslen = tsarray.shape[1]
                    # The resampled series is recomputed from the raw one
                    # rather than read back from disk.
                    x1 = np.linspace(min(tsarray[0,:]), max(tsarray[0,:]), self.tslen_SAX)
                    ts_SAX = TimeSeries(x1, interp1d(tsarray[0,:], tsarray[1,:])(x1))
                    self.rows_SAX[pk]['ts'] = ts_SAX
                    rep = isax_indb(ts_SAX, self.card, self.wordlength)
                    self.SAX_tree.insert(pk, rep)
                self.index_bulk(list(self.rows.keys()))
            except Exception as err:
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate; chaining keeps the real cause visible.
                raise IOError("Database does not exist or has been corrupted") from err
        else:
            if os.path.exists(dbname) and not overwrite:
                raise ValueError("Database of that name already exists. Delete existing db, rename, or set overwrite=True.")
Exemplo n.º 4
0
    def insert_ts(self, pk, ts):
        """
        Insert a new timeseries under primary key ``pk``.

        All validation and all disk writes happen before any in-memory state
        is changed, so a rejected or failed insert leaves ``rows``,
        ``rows_SAX`` and ``tslen`` untouched and can be retried.

        On success the method:
          * saves the raw series to ``<dbname>_ts/<pk>_ts.npy`` and a copy
            resampled to ``self.tslen_SAX`` points to
            ``<dbname>_ts_SAX/<pk>_ts_SAX.npy``;
          * writes a ``pk:<pkfield>:pk`` record (plus ``pk:vp:False`` when the
            schema has a 'vp' field) to the database file, truncating the
            file first if ``self.overwrite`` is set or it does not exist yet;
          * registers the series in ``rows``, ``rows_SAX`` and the SAX tree;
          * stores the distance to every key in ``self.vps`` through
            ``upsert_meta`` and calls ``update_indices`` for the new key.

        Parameters
        ----------
        pk : object
            Primary key; converted with ``str``.  May not contain ':'.
        ts : TimeSeries
            The series to store.  Must have the same length as the series
            already in the database.

        Raises
        ------
        ValueError
            If ``pk`` cannot be converted to str, contains ':', is already
            present, or if ``ts`` is not a TimeSeries or has the wrong length.
        """
        # ---- Validation: nothing below this section may fail on bad input ---- #
        try:
            pk = str(pk)
        except Exception as err:
            raise ValueError("Primary keys must be string-compatible") from err
        if ':' in pk:
            raise ValueError("Primary keys may not include the ':' character")
        if not isinstance(ts, TimeSeries):
            raise ValueError('Must insert a TimeSeries object')
        if pk in self.rows or pk in self.rows_SAX:
            raise ValueError('Duplicate primary key found during insert')
        if self.tslen is not None and len(ts) != self.tslen:
            raise ValueError('All timeseries must be of same length')

        # Fixed-length resampled copy used by the SAX index.
        x1 = np.linspace(min(ts.time), max(ts.time), self.tslen_SAX)
        ts_SAX = TimeSeries(x1, interp1d(ts.time, ts.data)(x1))
        rep = isax_indb(ts_SAX, self.card, self.wordlength)

        # ---- Disk: save both series as 2d numpy arrays ---- #
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.dbname+"_ts", exist_ok=True)
        np.save(self.dbname+"_ts/"+pk+"_ts.npy", np.vstack((ts.time, ts.data)))
        os.makedirs(self.dbname+"_ts_SAX", exist_ok=True)
        np.save(self.dbname+"_ts_SAX/"+pk+"_ts_SAX.npy", np.vstack((ts_SAX.time, ts_SAX.data)))

        # Save a record in the database file
        truncate = self.overwrite or not os.path.exists(self.dbname)
        with open(self.dbname, 'w' if truncate else 'a') as fd:
            if truncate:
                # Only the first write after construction may truncate.
                self.overwrite = False
            fd.write(pk+':'+self.pkfield+':'+pk+'\n')
            if 'vp' in self.schema:
                fd.write(pk+':vp:False\n')

        # ---- Memory: commit only after everything above succeeded ---- #
        if self.tslen is None:
            self.tslen = len(ts)
        self.rows[pk] = {self.pkfield: pk, 'ts': ts}
        self.rows_SAX[pk] = {self.pkfield: pk, 'ts': ts_SAX}
        if 'vp' in self.schema:
            self.rows[pk]['vp'] = False
            self.rows_SAX[pk]['vp'] = False
        self.SAX_tree.insert(pk, rep)

        # Record the distance from every current vantage point to the new series.
        for vp in self.vps:
            ts1 = self.rows[vp]['ts']
            self.upsert_meta(pk, {'d_vp-'+vp : self.dist(ts1, ts)})

        self.update_indices(pk)