def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "MayDec2015_htv.csv"  # The file to load
    feature = SourceFeatures.SOURCEHTAQNV

    args = parse_args()
    source_stability = args["source_stability"]

    output_filename = filename + feature + str(source_stability) + ".csv"

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    selector_stability = df[ProcessingFeatures.SOURCE_STABILITY] == source_stability
    selector_running = df[ProcessingFeatures.SOURCE_RUNNING] == 1

    df_new = df.loc[selector_stability & selector_running, feature].copy()
    df_new.to_csv(output_filename, header=True)
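# The scripts above and below call a parse_args() helper that is not part of
# this excerpt. A minimal sketch of what it might look like, assuming it wraps
# argparse and returns a plain dict; the exact flag names are taken from the
# dictionary keys used in the scripts, the defaults and help texts are guesses:
def parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Select and plot clustered source data")
    parser.add_argument(
        "--source_stability",
        type=int,
        default=1,
        help="1 to select data where the source was stable, 0 otherwise",
    )
    parser.add_argument(
        "--cluster",
        type=int,
        default=None,
        help="Cluster id to select, or omit to keep all clusters",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        default=1000,
        help="Number of rows to sample for plotting",
    )
    return vars(parser.parse_args())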
def train():
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Define the filename queue here
        filenames = []
        filename_queue = tf.train.string_input_producer(filenames)

        # Get features and labels for training
        features, label = load_data.read_data_from_csv(filename_queue)
        input_batch, label_batch = load_data.make_batch(
            features, label, min_queue_examples=1000, batch_size=100)

        # Build a graph that computes the predicted energy
        predicted_energy = main_functions.neural_net(input_batch)

        # Calculate the loss between the prediction and the label
        loss = main_functions.loss(predicted_energy, label_batch)

        # Build a graph that trains the model with one batch of data
        # and updates the model parameters
        train_op = main_functions.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                # Ask for the loss value before each run
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # fetch the loss

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    examples_per_sec = FLAGS.batch_size / duration
                    sec_per_batch = float(duration)
                    format_str = ('%s: step %d, loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook(),
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
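# The training loop above reads a TF 1.x FLAGS object that is defined elsewhere
# in the project. A minimal sketch of the flags it uses, assuming tf.app.flags;
# the flag names match the usage above, the default values are guesses:
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', './train',
                           "Directory to write checkpoints and event logs to.")
tf.app.flags.DEFINE_integer('max_steps', 100000,
                            "Number of batches to run before stopping.")
tf.app.flags.DEFINE_integer('batch_size', 100,
                            "Number of examples per batch.")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            "Whether to log which device each op is placed on.")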
def main():
    input_file = "../Data_Raw/Nov2018.csv"
    columns = [
        SourceFeatures.TIMESTAMP,
        SourceFeatures.BCT05_CURRENT,
        SourceFeatures.SOURCEHTAQNV,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.SPARK_COUNTER,
    ]

    df = ld.read_data_from_csv(input_file, columns, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    source_running = calculate_source_running(df[SourceFeatures.BCT05_CURRENT])

    # Mark windows where the variance of the HT current exceeds the threshold
    window_size = 20
    threshold = 0.25
    breakdowns = detect_breakdowns(
        df, SourceFeatures.SOURCEHTAQNI, window_size, threshold).astype("int64")

    # Detect sparks as downward spikes of the HT voltage below the threshold
    threshold = 1000
    df[ProcessingFeatures.HT_SPARKS_COUNTER] = detect_sparks(
        df[SourceFeatures.SOURCEHTAQNV], breakdowns, threshold)

    # Turn the detected sparks into a cumulative, forward-filled counter
    df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] == 0,
           ProcessingFeatures.HT_SPARKS_COUNTER] = np.nan
    df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0,
           ProcessingFeatures.HT_SPARKS_COUNTER] = np.arange(
        1, (df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0).sum() + 1)
    df[ProcessingFeatures.HT_SPARKS_COUNTER] = df[
        ProcessingFeatures.HT_SPARKS_COUNTER].ffill()

    # Do the same for the logged spark counter so the two can be compared
    df.loc[df[SourceFeatures.SPARK_COUNTER] ==
           df[SourceFeatures.SPARK_COUNTER].shift(1),
           SourceFeatures.SPARK_COUNTER] = np.nan
    df.loc[df[SourceFeatures.SPARK_COUNTER] == 0,
           SourceFeatures.SPARK_COUNTER] = np.nan
    df.loc[df[SourceFeatures.SPARK_COUNTER] > 0,
           SourceFeatures.SPARK_COUNTER] = np.arange(
        1, (df[SourceFeatures.SPARK_COUNTER] > 0).sum() + 1)
    df[SourceFeatures.SPARK_COUNTER] = df[SourceFeatures.SPARK_COUNTER].ffill()

    fig, ax = plt.subplots(2, 1, sharex=True)

    ax_htv = ax[0].twinx()
    ax_hti = ax[0].twinx()
    ax_hti.spines["right"].set_position(("axes", 1.04))
    # make_patch_spines_invisible(par2)
    ax_hti.spines["right"].set_visible(True)

    # ax[0].plot(df[ProcessingFeatures.HT_SPARKS_COUNTER], color='red')
    ax[0].plot(df[SourceFeatures.BCT05_CURRENT], color="red")
    ax_htv.plot(df[SourceFeatures.SOURCEHTAQNV])
    ax_hti.plot(df[SourceFeatures.SOURCEHTAQNI], color="orange")

    sparks_real = df[SourceFeatures.SPARK_COUNTER]

    ax12 = ax[1].twinx()
    ax[1].plot(df[ProcessingFeatures.HT_SPARKS_COUNTER], color="red")
    ax[1].plot(sparks_real, color="orange")

    plt.show()
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2018.csv"  # The file to load

    ######################
    ######## CODE ########
    ######################

    columns = [
        SourceFeatures.TIMESTAMP,
        SourceFeatures.BCT25_CURRENT,
        ProcessingFeatures.SOURCE_STABILITY,
    ]

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, columns, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    dates_stable = matplotlib.dates.date2num(
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 1].index.values)
    dates_unstable = matplotlib.dates.date2num(
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 0].index.values)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(
        dates_unstable,
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 0,
               SourceFeatures.BCT25_CURRENT].values,
        fmt=".",
        c="red",
        markersize=1,
    )
    ax.plot_date(
        dates_stable,
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 1,
               SourceFeatures.BCT25_CURRENT].values,
        fmt=".",
        c="black",
        markersize=1,
    )
    ax.set_ylim(-0.01, 0.08)
    ax.set_ylabel("BCT25 current [µA]")

    figManager = plt.get_current_fig_manager()
    figManager.window.showMaximized()
    plt.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.93,
                        wspace=None, hspace=0.4)
    plt.show()
def main(plot):
    year = 2016
    start_month = "Jan"
    end_month = "Nov"
    data_path = "../Data_Raw"

    for i, m in enumerate(
            months[months.index(start_month):months.index(end_month) + 1]):
        file_path = f"{data_path}/{m}{year}.csv"
        print(f"HT sparks for {file_path}")

        previous_month_file = None
        if i > 0:
            m_prev = months[months.index(m) - 1]
            previous_month_file = f"{data_path}/{m_prev}{year}_htv.csv"

        df = ld.read_data_from_csv(file_path, None, None)
        df = ld.fill_columns(df, previous_month_file, fill_nan_with_zeros=True)
        # df = ld.convert_column_types(df)

        # First we mark all time periods where the variance of the HT current is
        # above a certain threshold, to exclude these windows from the analysis.
        window_size = 40
        threshold_breakdowns = 0.25
        breakdowns = detect_breakdowns(
            df, SourceFeatures.SOURCEHTAQNI, window_size, threshold_breakdowns
        ).astype("int64")

        # Then we search for all downward spikes in the HT voltage that fall below
        # 1000 V and have a prominence of 500 V, i.e. are significant compared to
        # the background. These are the actual sparks and can be compared with
        # IP.NSRCGEN:SPARKS for 2018.
        threshold_sparks = 1000
        sparks = detect_sparks(
            df[SourceFeatures.SOURCEHTAQNV], breakdowns, threshold_sparks)

        df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] = breakdowns
        df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] = df[
            ProcessingFeatures.HT_VOLTAGE_BREAKDOWN].astype("Int32")
        df[ProcessingFeatures.HT_SPARKS_COUNTER] = sparks
        df[ProcessingFeatures.HT_SPARKS_COUNTER] = df[
            ProcessingFeatures.HT_SPARKS_COUNTER].astype("Int32")
        # df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] == 0, ProcessingFeatures.HT_SPARKS_COUNTER] = np.nan

        if plot:
            plot_breakdowns(df)

        # Blank out values that repeat the previous row so only changes remain
        mask = (df.shift(1) == df).fillna(value=True).astype(bool)
        df = df.where(~mask, np.nan)
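# The loop above indexes into a module-level `months` list that is defined
# elsewhere in the project. A plausible sketch, assuming standard three-letter
# month abbreviations that match the raw CSV file names (e.g. "Jan2016.csv"):
months = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]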
def main(input_file, output_file):
    folder = "../Data_Clustered/"
    input_file = f"{folder}{input_file}.csv"
    output_file = f"{folder}{output_file}.csv"

    df = ld.read_data_from_csv(input_file, None, None)
    df = fill_columns(df)
    df = reset_breakdown_clusters(df)
    df = assign_clusters(df)

    # Blank out values that repeat the previous row so only changes are stored
    mask = (df.shift(1) == df).fillna(value=True).astype(bool)
    df = df.where(~mask, np.nan)

    # df = df.round(4)
    df.to_csv(output_file)
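# A minimal sketch of how this clustering entry point might be invoked. The
# command-line wrapper itself is an assumption; the default file names are only
# illustrative and are borrowed from other scripts in this collection:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Assign clusters to a clustered-data CSV")
    parser.add_argument("--input_file", default="JanNov2018",
                        help="Input file name without the .csv extension")
    parser.add_argument("--output_file", default="JanNov2018_sparks_clustered",
                        help="Output file name without the .csv extension")
    args = parser.parse_args()
    main(args.input_file, args.output_file)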
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2016.csv"  # The file to load
    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        SourceFeatures.THOMSON_FORWARDPOWER,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to be displayed

    args = parse_args()
    source_stability = args["source_stability"]
    cluster = args["cluster"]
    sample_size = args["sample_size"]

    ######################
    ######## CODE ########
    ######################

    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    df = df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == source_stability].copy()
    if cluster is not None:
        df = df.loc[df[ProcessingFeatures.CLUSTER] == cluster].copy()

    # Draw a random sample of at most sample_size rows for the pair plot
    index_length = len(df.index)
    indices = np.random.permutation(range(index_length))[
        :min(sample_size, index_length)]
    data = df.loc[df.index[indices]].copy()

    sns.pairplot(data, vars=features, hue=ProcessingFeatures.CLUSTER)
    plt.show()
def main(year, source_stability, cluster, show_breakdowns):
    ######################
    ###### SETTINGS ######
    ######################

    if year == 2018:
        input_file = "../Data_Clustered/JanNov2018_sparks_clustered.csv"
        # features.append(SourceFeatures.SAIREM2_FORWARDPOWER)
    elif year == 2016:
        input_file = "../Data_Clustered/JanNov2016_sparks_clustered.csv"
        # features.append(SourceFeatures.THOMSON_FORWARDPOWER)

    features = [
        # SourceFeatures.BIASDISCAQNV,
        # SourceFeatures.GASAQN,
        # SourceFeatures.OVEN1AQNP,
        # SourceFeatures.OVEN2AQNP,
        # SourceFeatures.SOLINJ_CURRENT,
        # SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNV,
        SourceFeatures.SOURCEHTAQNI,
    ]  # Features to be displayed
    features.append(SourceFeatures.BCT25_CURRENT)

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    df = ld.read_data_from_csv(input_file, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    if cluster is not None:
        df = df[df[ProcessingFeatures.CLUSTER] == cluster].copy()
    df = df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == source_stability].copy()

    dates_nobreakdown = matplotlib.dates.date2num(
        df[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0].index)
    dates_breakdown = matplotlib.dates.date2num(
        df[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0].index)
    dates = df.index.values
    # datesIndices = np.arange(len(dates))

    df[SourceFeatures.BCT25_CURRENT] *= 1000

    fig, ax = plt.subplots(len(features), 1, sharex=True, figsize=(6, 6))
    for i, parameter in enumerate(features):
        # formatter = DateFormatter(dates)
        # ax[i].xaxis.set_major_formatter(formatter)
        ax[i].set_ylabel("{}".format(parameter), labelpad=40, fontsize=24)
        ax[i].set_xlabel("", labelpad=40, fontsize=24)
        ax[i].tick_params(axis="both", which="major", labelsize=22)

        if show_breakdowns:
            # ax[i].plot(datesIndices[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0], df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0, parameter].values, linestyle='', marker='.', markersize=1, color='#ff7f0e')
            ax[i].plot_date(
                dates_breakdown,
                df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0,
                       parameter].values,
                linestyle="",
                marker=".",
                markersize=1,
                color="red",
            )

        ax[i].plot_date(
            dates_nobreakdown,
            df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0,
                   parameter].values,
            linestyle="",
            marker=".",
            markersize=1,
            color="black",
        )

        if show_breakdowns:
            ymin, ymax = ax[i].get_ylim()
            ax[i].vlines(
                df[df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0].index,
                ymin=ymin,
                ymax=ymax,
                color="black",
                ls="dashed",
                linewidths=1,
            )

        # ax[i].plot(datesIndices[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0], df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0, parameter].values, linestyle='', marker='.', markersize=1, color='#1f77b4')
        ax[i].grid(True)
        ax[i].xaxis.set_major_locator(
            mdates.HourLocator(interval=24))  # to get a tick every 24 hours
        ax[i].xaxis.set_major_formatter(mdates.DateFormatter("%d-%m %H:00"))

    figManager = plt.get_current_fig_manager()
    # figManager.window.showMaximized()
    # fig.suptitle("Time development of cluster {}".format(cluster))
    plt.tight_layout()
    fig.align_ylabels()
    plt.show()
def equalize_class_data(sents_tokenized_train, labels_train):
    """Downsample class 1 so the two classes end up roughly balanced.

    The signature and the initialisation below are reconstructed from the call
    site further down; capping class 1 at the size of class 0 is an assumption.
    """
    train_data_eq, train_labels_eq = [], []
    class_1_threshold = sum(1 for label in labels_train if label == 0)

    class_1_count = 0
    for sent, label in zip(sents_tokenized_train, labels_train):
        if label == 0:
            train_data_eq.append(sent)
            train_labels_eq.append(label)
        elif label == 1:
            if class_1_count <= class_1_threshold:
                train_data_eq.append(sent)
                train_labels_eq.append(label)
                class_1_count += 1
    return train_data_eq, train_labels_eq


if __name__ == '__main__':
    sents_tokenized_train, labels_train = read_data_from_csv(
        config.cola_tokenized_tsv_filename_train)
    sents_tokenized_dev, labels_dev = read_data_from_csv(
        config.cola_tokenized_tsv_filename_dev)

    # equalize class counts
    sents_tokenized_train, labels_train = equalize_class_data(
        sents_tokenized_train, labels_train)

    # shuffle the data
    sents_tokenized_train, labels_train = shuffle_data(
        sents_tokenized_train, labels_train)

    # Build unigram-to-trigram counts on the already tokenized sentences
    cv = CountVectorizer(analyzer='word',
                         tokenizer=dummy_placeholder_func,
                         preprocessor=dummy_placeholder_func,
                         token_pattern=None,
                         ngram_range=(1, 3))
    cvX_train = cv.fit_transform(sents_tokenized_train)
    cvX_dev = cv.transform(sents_tokenized_dev)
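# CountVectorizer is handed dummy_placeholder_func above because the sentences
# are already tokenized. A minimal sketch of that helper, assuming it is simply
# an identity function that passes the token lists through unchanged:
def dummy_placeholder_func(tokens):
    return tokens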
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2018.csv"  # The file to load
    source_stability = 1  # 1 if we want to look at a stable source, 0 else
    cluster = 51  # The cluster to plot or None if you want to plot all data
    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        SourceFeatures.SAIREM2_FORWARDPOWER,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to be displayed
    normalize = True  # Do we want to standard scale the data?
    bandwidth = np.array(
        [0.014, 0.011, 0.014, 0.014, 0.014, 0.014, 0.014, 0.014, 0.014]
    )  # bandwidth for unnormalized data
    # bandwidth = 0.02

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    df = df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == source_stability, :].copy()
    total_duration = df[ProcessingFeatures.DATAPOINT_DURATION].sum()

    data = df[features].values
    weights = df[ProcessingFeatures.DATAPOINT_DURATION].values

    if normalize:
        # data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)  # Standard scaling
        # data = (data - np.min(data, axis=0)) / (np.max(data, axis=0) - np.min(data, axis=0))  # MinMax scaling
        # data = data / np.max(np.absolute(data), axis=0)  # Max scaling
        data = (data - np.median(data, axis=0)) / (
            np.quantile(data, q=0.9, axis=0) - np.quantile(data, q=0.1, axis=0)
        )  # Robust scaler

    if cluster is not None:
        data = data[df[ProcessingFeatures.CLUSTER] == cluster]
        weights = weights[df[ProcessingFeatures.CLUSTER] == cluster]

    resolution = 5000
    # if cluster is not None:
    #     bandwidth *= 0.2
    num_kde_samples = 40000

    cluster_duration = np.sum(weights)
    percentage_of_values = cluster_duration / total_duration

    plot_cluster(
        data,
        weights,
        features,
        feature_ranges=None,
        median=None,
        resolution=resolution,
        bandwidth=bandwidth,
        num_kde_samples=num_kde_samples,
        cluster=cluster,
        percentage_of_values=percentage_of_values,
    )
def main(
    year,
    source_stability,
    count_breakdowns_per_cluster,
    num_clusters_to_visualize,
    print_to_file,
    display_metrics,
):
    ######################
    ###### SETTINGS ######
    ######################

    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        # SourceFeatures.OVEN2AQNP,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to load

    if year == 2018:
        input_file = "../Data_Clustered/JanNov2018_sparks_clustered.csv"
        output_file = "./Results/2018_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.SAIREM2_FORWARDPOWER)
    elif year == 2016:
        input_file = "../Data_Clustered/JanNov2016_sparks_clustered.csv"
        output_file = "./Results/2016_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.THOMSON_FORWARDPOWER)
    elif year == 2015:
        input_file = "../Data_Clustered/MayDec2015_sparks_clustered.csv"
        output_file = "./Results/2015_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.THOMSON_FORWARDPOWER)

    statistics = ["median", "std%"]  # Statistics we are interested in

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    df = ld.read_data_from_csv(input_file, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    for feature in features:
        if feature not in df.columns:
            print(
                "{} does not exist as a feature in the loaded file. Aborting.".format(
                    feature))
            return

    # Calculate oven refills
    oven_refill_ends = calculate_oven_refill_ends(df[SourceFeatures.OVEN1AQNP])
    if year == 2018:
        oven_refill_ends = clear_refills_2018(oven_refill_ends)
    elif year == 2016:
        oven_refill_ends = clear_refills_2016(oven_refill_ends)
    print("There were {} oven refills.".format(len(oven_refill_ends)))

    # Select only the stability we are interested in
    df = df[df[ProcessingFeatures.SOURCE_STABILITY] == source_stability].copy()
    total_duration = df[ProcessingFeatures.DATAPOINT_DURATION].sum() / 3600

    # Describe the clusters
    print("Calculating statistics...")
    described = df.groupby(ProcessingFeatures.CLUSTER).apply(
        describe_cluster,
        features=features,
        weight_column=ProcessingFeatures.DATAPOINT_DURATION,
        oven_refills=oven_refill_ends,
    )
    described[("DENSITY", "percentage")] = (
        described[("DURATION", "in_hours")] / total_duration * 100
    )

    # Gather statistics to output
    wanted_statistics = get_wanted_statistics(features, statistics) + [
        ("DENSITY", "percentage"),
        ("DURATION", "in_hours"),
        ("DURATION", "longest_in_hours"),
        ("DURATION", "num_splits"),
        ("REFILL", "index"),
        ("REFILL", "delta_in_hours"),
    ]
    if count_breakdowns_per_cluster:
        wanted_statistics += [("num_breakdowns", "per_hour")]

    # Calculate metrics
    if display_metrics:
        metrics = calculate_metrics(df, features)
        print("DBI is {}".format(np.mean(metrics["DBI"])))
        described.loc[described.index >= 0, ("METRICS", "DBI")] = metrics["DBI"]
        print("Silhouette is {}".format(np.mean(metrics["silhouette"])))
        described.loc[described.index >= 0, ("METRICS", "silhouette")] = metrics[
            "silhouette"]
        wanted_statistics += [("METRICS", "DBI"), ("METRICS", "silhouette")]

    described.sort_values(by=[("DENSITY", "percentage")], ascending=False,
                          inplace=True)

    print("Rounding values...")
    printable_clusters = described[wanted_statistics].head(n=num_clusters_to_visualize)
    print(
        "Sum of densities of printed clusters: {:.1f}%".format(
            printable_clusters[("DENSITY", "percentage")].sum()))
    print(
        "Sum of duration of printed clusters \n"
        "when source was running: {:.1f}".format(
            printable_clusters.loc[
                printable_clusters.index >= 0, ("DURATION", "in_hours")
            ].sum()))

    printable_clusters = round_described(
        printable_clusters,
        {
            SourceFeatures.BIASDISCAQNV: 0,
            SourceFeatures.GASAQN: 2,
            SourceFeatures.OVEN1AQNP: 1,
            SourceFeatures.OVEN2AQNP: 1,
            SourceFeatures.THOMSON_FORWARDPOWER: 0,
            SourceFeatures.SAIREM2_FORWARDPOWER: 0,
            SourceFeatures.SOLINJ_CURRENT: 0,
            SourceFeatures.SOLCEN_CURRENT: 0,
            SourceFeatures.SOLEXT_CURRENT: 0,
            SourceFeatures.SOURCEHTAQNI: 2,
            SourceFeatures.BCT25_CURRENT: 3,
        },
    )
    printable_clusters.rename(
        {
            SourceFeatures.BIASDISCAQNV: "bias disc",
            SourceFeatures.GASAQN: "gas",
            SourceFeatures.OVEN1AQNP: "oven1",
            SourceFeatures.OVEN2AQNP: "oven2",
            SourceFeatures.SAIREM2_FORWARDPOWER: "RF",
            SourceFeatures.THOMSON_FORWARDPOWER: "RF",
            SourceFeatures.SOLINJ_CURRENT: "solinj",
            SourceFeatures.SOLCEN_CURRENT: "solcen",
            SourceFeatures.SOLEXT_CURRENT: "solext",
            SourceFeatures.SOURCEHTAQNI: "HTI",
            SourceFeatures.BCT25_CURRENT: "BCT25",
        },
        axis="columns",
        inplace=True,
    )

    if print_to_file:
        printable_clusters.to_csv(output_file)
        print("Saved result to {}".format(output_file))
    else:
        print(printable_clusters)
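# get_wanted_statistics() is defined elsewhere in the project. A minimal sketch
# consistent with how its return value is concatenated with the (group, statistic)
# column tuples above, assuming it simply pairs every feature with every requested
# statistic:
def get_wanted_statistics(features, statistics):
    return [(feature, stat) for feature in features for stat in statistics]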