Example #1
    def __init__(self, downsampling_step, sequence_length):
        loading_dataset_since = time()
        extension = 'xlsx'
        self.downsampling_step = downsampling_step
        self.sequence_length = sequence_length
        all_filenames = glob.glob('*.{}'.format(extension))  # find all files
        data_pd = pd.concat(
            [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
            ignore_index=True)  #concat all the data
        data_numpy = data_pd.to_numpy().astype(float)
        zeros_removed = remove_zeros(data_numpy)
        downsampled_data = downsample(zeros_removed, downsampling_step)
        time_series_data = split_time_series(downsampled_data, sequence_length)
        sc = StandardScaler()
        scaled_data = sc.fit_transform(time_series_data)
        scaled_data_tensor = torch.from_numpy(scaled_data)
        scaled_data_tensor_reshaped = scaled_data_tensor.unsqueeze(
            0).transpose(1, 0)
        self.len = scaled_data_tensor_reshaped.shape[0]
        self.training_data_tensor = scaled_data_tensor_reshaped
        loading_dataset_end = time()
        hours, minutes, seconds = timer(loading_dataset_since,
                                        loading_dataset_end)

        print('The length of the dataset is {}'.format(
            len(self.training_data_tensor)))
        print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
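All of the examples on this page call project helpers such as remove_zeros, downsample, split_time_series and timer that are imported elsewhere and never shown. Below is a minimal sketch of plausible implementations, assuming downsample keeps every step-th sample and split_time_series cuts a 1-D signal into non-overlapping windows; these are assumptions, not the original helpers (the Flask examples further down pass a list of [x, y] points to downsample, so that project's version likely differs).

import numpy as np

def remove_zeros(signal):
    # drop zero-valued samples (assumed behaviour)
    return signal[signal != 0]

def downsample(signal, step):
    # keep every `step`-th sample
    return signal[::step]

def split_time_series(signal, sequence_length):
    # reshape a 1-D signal into non-overlapping windows of length `sequence_length`
    n_windows = len(signal) // sequence_length
    return signal[:n_windows * sequence_length].reshape(n_windows, sequence_length)

def timer(start, end):
    # convert elapsed seconds into (hours, minutes, seconds)
    hours, rest = divmod(end - start, 3600)
    minutes, seconds = divmod(rest, 60)
    return hours, minutes, seconds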
Example #2
def cluster_edges(input_dir, output_file):
    file_names = os.listdir(input_dir)

    edges = []
    kept_file_names = []  # files with empty edges are skipped, so track the names actually clustered
    for file_name in file_names:
        edge = load_edge(file_name)
        if edge.shape[0] == 0:
            continue
        axis_aligned = helpers.axis_align_pandas(edge.sort_values(by='x'))
        edges.append(axis_aligned)
        kept_file_names.append(file_name)

    downsampled_edges = []
    for edge in edges:
        downsampled_edges.append(
            helpers.downsample(helpers.axis_align_pandas(edge), 1000))

    total_frame_cols = get_cols_from_frame(downsampled_edges[0])
    total_frame = pd.DataFrame(columns=total_frame_cols)

    for edge in downsampled_edges:
        total_frame = total_frame.append(frame_to_row(edge), ignore_index=True)

    kmeans = KMeans(N_CLUSTERS, n_jobs=-1)
    kmeans.fit(total_frame)
    clusters = kmeans.predict(total_frame)
    pd.DataFrame({
        'filename': kept_file_names,
        'cluster': clusters
    }).to_csv(output_file, index=False)
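get_cols_from_frame and frame_to_row are not shown either; from their usage they appear to flatten one downsampled edge (1000 points) into a single fixed-width feature row so that every edge becomes one KMeans sample. A minimal sketch under that assumption follows; the column names and the choice of the y profile are guesses, not the original code.

import pandas as pd

def get_cols_from_frame(edge):
    # one column per sample point of the downsampled edge (assumed layout)
    return ['y_{}'.format(i) for i in range(len(edge))]

def frame_to_row(edge):
    # flatten the edge's y profile into a single feature row, indexed by the column names above
    return pd.Series(edge['y'].to_numpy(), index=get_cols_from_frame(edge))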
Example #3
def find_flats(input_dir, output_file):
    # Takes a directory of edge files.
    # Finds lines which are probably flat by clustering the edges and
    # writes each file's cluster assignment to output_file.

    file_names = os.listdir(input_dir)

    edges = []
    for file_name in file_names:
        edge = helpers.axis_align_pandas(
            load_edge(file_name).sort_values(by='x'))
        edges.append(edge)

    downsampled_edges = []
    for edge in edges:
        downsampled_edges.append(
            helpers.downsample(helpers.axis_align_pandas(edge), 1000))

    total_frame_cols = get_cols_from_frame(downsampled_edges[0])
    total_frame = pd.DataFrame(columns=total_frame_cols)

    for edge in downsampled_edges:
        total_frame = total_frame.append(frame_to_row(edge), ignore_index=True)

    kmeans = KMeans(N_CLUSTERS)
    kmeans.fit(total_frame)
    clusters = kmeans.predict(total_frame)
    pd.DataFrame({
        'filename': file_names,
        'cluster': clusters
    }).to_csv(output_file, index=False)
Example #4
    def send_spectrogram_update(self, spec, canvas_id=None):
        spec = downsample(spec)
        spec = astype(spec)
        nblocks, nfreqs = spec.shape
        print("spec_update:::shape:", spec.shape, "ch:", canvas_id)
        self.send_message('spectrogram',
                          {'action': 'update',
                           'nblocks': nblocks,
                           'nfreqs': nfreqs,
                           'canvasId': canvas_id},
                          spec.tostring())
Example #5
    def send_spectrogram_update(self, spec, canvas_id=None):
        spec = downsample(spec)
        spec = astype(spec)
        nblocks, nfreqs = spec.shape
        print("spec_update:::shape:", spec.shape, "ch:", canvas_id)
        self.send_message(
            'spectrogram', {
                'action': 'update',
                'nblocks': nblocks,
                'nfreqs': nfreqs,
                'canvasId': canvas_id
            }, spec.tostring())
Example #6
def seq_query():
    # takes a seq hash and returns a downsampled region
    seq_hash = str(request.args["seq_hash"])
    method = request.args["method"]

    logging.debug(f"Getting data for seq ID {seq_hash}")

    if LOCAL:
        df = pd.read_parquet(f"data/{seq_hash}.{method}.parquet.sz")
    else:
        df = query_x_range(f"{seq_hash}.{method}.parquet.sz",
                           request.args.get("x_min"),
                           request.args.get("x_max"))

    logging.debug("Got the data")

    x_min = float(request.args.get("x_min", df.x.min()))
    x_max = float(request.args.get("x_max", df.x.max()))
    zone = df.loc[(df.x >= x_min) & (df.x <= x_max)].values.tolist()
    return jsonify((seq_hash, downsample(zone)))
Example #7
def transform_route():
    sequence = request.form["seq"]
    seq_name = request.form["seq_name"]
    method = request.form["method"]

    logging.debug("Hashing seq")
    seq_hash = str(xxhash.xxh64(sequence).intdigest())

    if LOCAL:
        exists = os.path.exists(f"data/{seq_hash}.{method}.parquet.sz")
        logging.debug(f"Checked for {seq_hash} locally: exists={exists}")
    else:
        exists = exists_on_s3(f"{seq_hash}.{method}.parquet.sz")
        logging.debug(f"Checked for {seq_hash} on S3: exists={exists}")

    if exists:
        if LOCAL:
            df = pd.read_parquet(f"data/{seq_hash}.{method}.parquet.sz")
        else:
            df = query_x_range(f"{seq_hash}.{method}.parquet.sz")

    else:
        logging.debug(
            f"No previous transformation for {seq_name} found. Transforming..."
        )
        transformed = transform(sequence, method=method)

        logging.debug("Saving transformed data for " + seq_name)
        df = pd.DataFrame(dict(x=transformed[0], y=transformed[1]))
        df.to_parquet(f"data/{seq_hash}.{method}.parquet.sz")

        if not LOCAL:
            logging.debug(f"Uploading {seq_hash} to S3")
            upload(f"{seq_hash}.{method}.parquet.sz")

    logging.debug(f"Got the overview data for {seq_hash}")

    zone = df.values.tolist()
    return jsonify((seq_hash, downsample(zone)))
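seq_query and transform_route are Flask view functions, but the route registration, the LOCAL flag and the storage helpers (query_x_range, exists_on_s3, upload) are not shown. Below is a minimal sketch of how the wiring might look, assuming both functions are defined in the same module; the URL rules, port and environment variable are illustrative assumptions only.

import logging
import os

from flask import Flask, jsonify, request

app = Flask(__name__)
LOCAL = os.environ.get("LOCAL", "1") == "1"  # assumed switch between local files and S3

# hypothetical registration; the original decorators/paths are not shown
app.add_url_rule("/seq", view_func=seq_query, methods=["GET"])
app.add_url_rule("/transform", view_func=transform_route, methods=["POST"])

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    app.run(port=5000)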
Example #8
    def __init__(self,
                 downsampling_step,
                 sequence_length,
                 train=True,
                 normalize=False):
        loading_dataset_since = time()
        extension = 'xlsx'

        self.downsampling_step = downsampling_step
        self.sequence_length = sequence_length

        # find all files and concatenate
        all_filenames = glob.glob('*.{}'.format(extension))

        data = pd.concat(
            [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
            ignore_index=True)

        #extract torque and label
        torque = data.iloc[:, 0].to_numpy().astype(float)
        label = data.iloc[:, 1].to_numpy().astype(float)

        #remove zeros from torque and label
        label = np.delete(label, np.where(torque == 0))
        torque = remove_zeros(torque)

        #expand dimension and store the zero removed data
        torque = np.expand_dims(torque, axis=1)
        label = np.expand_dims(label, axis=1)
        data = np.append(torque, label, axis=1)

        # find the normal and anomalous labeled sequences and divide the data into segments
        segmented_list = consecutive(
            (np.where(data[:, 1] == 0))[0]) + consecutive(
                (np.where(data[:, 1] == 1))[0])
        segmented_list.sort(key=lambda segment: segment[1])
        segmented_data = []
        for i in range(len(segmented_list)):
            segments = segmented_list[i]
            start_index = segments[0]
            end_index = segments[len(segments) - 1]
            seg_data = data[start_index:end_index + 1, :]
            segmented_data.append(seg_data)

        # downsample the data and make sequences
        sequenced_data = []
        for i in range(len(segmented_data)):
            label = segmented_data[i][0, 1]
            data = downsample(segmented_data[i][:, 0], self.downsampling_step)
            data = split_time_series(data, self.sequence_length)
            if label == 0.:
                label_column = [0] * len(data)
            else:
                label_column = [1] * len(data)

            sequenced_data.append(np.column_stack((data, label_column)))

        data = np.empty((0, self.sequence_length + 1))

        for i in range(len(sequenced_data)):
            if sequenced_data[i].shape[1] == self.sequence_length + 1:
                data = np.append(data, sequenced_data[i], axis=0)

        if normalize:
            # scale the data and return the tensor output
            sc = StandardScaler()

            training_data = data[0:int(0.7 * (len(data))),
                                 0:self.sequence_length]
            testing_data = data[int(0.7 * (len(data))):,
                                0:self.sequence_length]

            training_label = data[0:int(0.7 * (len(data))), -1]
            testing_label = data[int(0.7 * (len(data))):, -1]

            sc_fit = sc.fit(training_data)

            if train:
                unlabeled_data = sc_fit.transform(training_data)
                data = np.column_stack((unlabeled_data, training_label))
            else:
                unlabeled_data = sc_fit.transform(testing_data)
                data = np.column_stack((unlabeled_data, testing_label))
        else:
            if train:
                data = data[0:int(0.7 * (len(data))), :]
            else:
                data = data[int(0.7 * (len(data))):, :]
        data = torch.from_numpy(data).unsqueeze(0).transpose(1, 0)

        self.len = data.shape[0]
        self.data = data

        loading_dataset_end = time()
        hours, minutes, seconds = timer(loading_dataset_since,
                                        loading_dataset_end)

        print('The length of the dataset is {}'.format(self.len))
        print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
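Example #8 also depends on a consecutive helper that splits the index array returned by np.where into runs of consecutive indices, so that each run corresponds to one contiguous normal or anomalous segment. A minimal sketch, assuming the standard NumPy idiom (not the original implementation):

import numpy as np

def consecutive(indices, stepsize=1):
    # split a sorted 1-D index array into runs of consecutive values,
    # e.g. [1, 2, 3, 7, 8] -> [array([1, 2, 3]), array([7, 8])]
    return np.split(indices, np.where(np.diff(indices) != stepsize)[0] + 1)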
Example #9
    def __init__(self,
                 sequence_length,
                 downsampling_step=10,
                 train=True,
                 normalize=True):

        self.sequence_length = sequence_length
        self.downsampling_step = downsampling_step
        self.train = train
        self.normalize = normalize

        #load the data
        if self.train:
            #fault free training data

            load = pyreadr.read_r(
                r'.\TE_Data_full\TEP_FaultFree_Training.RData')
            load = load['fault_free_training']
            temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
            #temp_label = np.asarray(load, dtype=np.int32)[:,0]

            if self.normalize:
                sc = StandardScaler()
                temp_data = sc.fit_transform(temp_data)

            fault_free_training = temp_data
            del (temp_data)

            #make sequences
            sequenced_data_list = []

            for variables in fault_free_training.T:
                temp_data_d = downsample(variables, self.downsampling_step)
                temp_data = split_time_series(temp_data_d,
                                              self.sequence_length)
                temp_data = np.expand_dims(temp_data, axis=1)
                sequenced_data_list.append(temp_data)

            fault_free_training = torch.empty(
                ((sequenced_data_list[0].shape[0]), 0, self.sequence_length))

            for sequences in sequenced_data_list:
                fault_free_training = torch.cat(
                    (fault_free_training, torch.from_numpy(sequences)), dim=1)

            fault_free_label = np.zeros(fault_free_training.shape[0],
                                        dtype=np.int32)

            del (sequenced_data_list)
            del (temp_data)

            #faulty training data

            load = pyreadr.read_r(r'.\TE_Data_full\TEP_Faulty_Training.RData')
            load = load['faulty_training']
            temp_data = np.asarray(load, dtype=np.float32)[:, 3:]

            temp_label = np.asarray(load, dtype=np.int32)[:, 0]

            if self.normalize:
                temp_data = sc.fit_transform(temp_data)

            faulty_training = temp_data
            del (temp_data)

            #make sequences
            sequenced_data_list = []

            for variables in faulty_training.T:
                temp_data_d = downsample(variables, self.downsampling_step)
                temp_data = split_time_series(temp_data_d,
                                              self.sequence_length)
                temp_data = np.expand_dims(temp_data, axis=1)
                sequenced_data_list.append(temp_data)

            faulty_training = torch.empty(
                (sequenced_data_list[0].shape[0], 0, self.sequence_length))

            for sequences in sequenced_data_list:

                faulty_training = torch.cat(
                    (faulty_training, torch.from_numpy(sequences)), dim=1)

            faulty_label = np.ones(faulty_training.shape[0], dtype=np.int32)

            del (sequenced_data_list)
            del (temp_data)

            training_data = torch.cat((fault_free_training, faulty_training),
                                      dim=0)
            training_label = np.concatenate((fault_free_label, faulty_label),
                                            axis=0)

            self.data = training_data
            self.label = torch.from_numpy(training_label)
            self.len = len(self.data)
            print('The length of the dataset is {}'.format(self.len))

        else:

            #fault free testing data

            load = pyreadr.read_r(r'.\TE_Data_full\TEP_FaultFree_Testing.RData')
            load = load['fault_free_testing']
            temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
            #temp_label = np.asarray(load, dtype=np.int32)[:,0]

            if self.normalize:
                sc = StandardScaler()
                temp_data = sc.fit_transform(temp_data)

            fault_free_testing = temp_data
            del (temp_data)

            #make sequences
            sequenced_data_list = []

            for variables in fault_free_testing.T:
                temp_data_d = downsample(variables, self.downsampling_step)
                temp_data = split_time_series(temp_data_d,
                                              self.sequence_length)
                temp_data = np.expand_dims(temp_data, axis=1)
                sequenced_data_list.append(temp_data)

            fault_free_testing = torch.empty(
                ((sequenced_data_list[0].shape[0]), 0, self.sequence_length))

            for sequences in sequenced_data_list:
                fault_free_testing = torch.cat(
                    (fault_free_testing, torch.from_numpy(sequences)), dim=1)

            fault_free_label = np.zeros(fault_free_testing.shape[0],
                                        dtype=np.int32)

            del (sequenced_data_list)
            del (temp_data)

            #faulty testing data

            load = pyreadr.read_r(r'.\TE_Data_full\TEP_Faulty_Testing.RData')
            load = load['faulty_testing']
            temp_data = np.asarray(load, dtype=np.float32)[:, 3:]

            #temp_label = np.asarray(load, dtype=np.int32)[:,0]

            if self.normalize:
                sc = StandardScaler()
                temp_data = sc.fit_transform(temp_data)

            faulty_testing = temp_data
            del (temp_data)

            #make sequences
            sequenced_data_list = []

            for variables in faulty_testing.T:
                temp_data_d = downsample(variables, self.downsampling_step)
                temp_data = split_time_series(temp_data_d,
                                              self.sequence_length)
                temp_data = np.expand_dims(temp_data, axis=1)
                sequenced_data_list.append(temp_data)

            faulty_testing = torch.empty(
                (sequenced_data_list[0].shape[0], 0, self.sequence_length))

            for sequences in sequenced_data_list:

                faulty_testing = torch.cat(
                    (faulty_testing, torch.from_numpy(sequences)), dim=1)

            faulty_label = np.ones(faulty_testing.shape[0], dtype=np.int32)

            del (sequenced_data_list)
            del (temp_data)

            testing_data = torch.cat((fault_free_testing, faulty_testing),
                                     dim=0)
            testing_label = np.concatenate((fault_free_label, faulty_label),
                                           axis=0)

            self.data = testing_data
            self.label = torch.from_numpy(testing_label)
            self.len = len(self.data)
            print('The length of the dataset is {}'.format(self.len))
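The __init__ methods in Examples #8 and #9 belong to PyTorch dataset classes (they set self.data, self.label and self.len). To be used with a DataLoader they also need __len__ and __getitem__; below is a minimal sketch of that boilerplate plus typical usage, with the class name and constructor arguments assumed for illustration only.

from torch.utils.data import Dataset, DataLoader

class TEPDataset(Dataset):
    # hypothetical class name; __init__ is the one shown in Example #9

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.data[index], self.label[index]

# assumed usage
train_set = TEPDataset(sequence_length=100, downsampling_step=10, train=True)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
for sequences, labels in train_loader:
    pass  # feed each batch of sequences and labels to the model here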