def open_sim_out_file(chan):
    global out_file
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    out_dir_path = graphs_results_dir + constants.path_sim_graphs_dir
    out_file_name = out_dir_path + "similarity_report_" + chan + "_" + datestr + ".csv"
    out_file = open(out_file_name, "w")
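
# close_sim_out_file() is called from create_sim_plots() below but is not
# defined in this section; a minimal sketch, assuming it only has to release
# the module-level handle opened by open_sim_out_file():
def close_sim_out_file():
    global out_file
    if out_file is not None:
        out_file.close()
        out_file = None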
def create_graph_directory():
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    graph_types_dirs = {
        constants.path_radar_graphs_dir,
        constants.path_sim_graphs_dir,
        constants.path_scatered_graphs_dir,
        constants.path_random_forest_graphs_dir,
    }
    if not os.path.exists(graphs_results_dir):
        os.makedirs(graphs_results_dir)
    for dir_name in graph_types_dirs:
        path = graphs_results_dir + dir_name
        if not os.path.exists(path):
            os.makedirs(path)
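
# The exists-then-makedirs pattern above recurs throughout this module. A
# hypothetical helper (not part of the original code) that collapses it to a
# single call on Python 3.2+; exist_ok=True simply suppresses the error when
# the directory already exists:
def ensure_dir(path):
    os.makedirs(path, exist_ok=True)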
def create_scatter_chart(dataframe, feature, sorter):
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    out_dir_path = graphs_results_dir + constants.path_scatered_graphs_dir
    if not os.path.exists(out_dir_path):
        os.makedirs(out_dir_path)
    figscatter = px.scatter(dataframe,
                            x=constants.channel_out_col_title,
                            y=feature,
                            color=sorter,
                            height=1000,
                            width=1600)
    figscatter.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    # Note: the hard-coded backslash separator makes this path Windows-only;
    # os.path.join(out_dir_path, ...) would be portable.
    scatter_chart_file_name = out_dir_path + '\\' + feature + '_scatter_chart.png'
    figscatter.write_image(file=scatter_chart_file_name, format='png')
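
# A minimal usage sketch for create_scatter_chart(). The column names below
# are placeholders; the real ones come from constants.channel_out_col_title
# and the feature/sorter columns of the features dataframe. Note that
# fig.write_image() requires the kaleido (or legacy orca) rendering engine.
#
#     import pandas as pd
#     df = pd.DataFrame({
#         "QCM": ["qcm_1", "qcm_2", "qcm_3"],    # channel column
#         "peak_height": [0.4, 0.7, 0.2],        # feature column
#         "Product": ["A", "A", "B"],            # sorter column
#     })
#     create_scatter_chart(df, "peak_height", "Product")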
def create_radar_charts(dataframe, feature, sorter):
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    out_dir_path = graphs_results_dir + constants.path_radar_graphs_dir
    if not os.path.exists(out_dir_path):
        os.makedirs(out_dir_path)
    # Earlier experiments used go.Figure() and px.line_polar(theta="QCM",
    # color="Product"); px.scatter_polar was kept.
    fig = px.scatter_polar(dataframe,
                           r=feature,
                           theta=constants.channel_out_col_title,
                           color=sorter,
                           height=1600,
                           width=1800)
    fig.update_traces(mode="lines+markers",
                      marker=dict(symbol="diamond-open", size=6))
    fig.update_layout(polar=dict(radialaxis=dict(visible=True)),
                      showlegend=True,
                      title=feature,
                      font=dict(family="Courier New, monospace",
                                size=32,
                                color="Black"),
                      margin=dict(l=270, r=80, t=80, b=20))
    radar_chart_file_name = out_dir_path + '\\' + feature + '_radar_chart.png'
    fig.write_image(file=radar_chart_file_name, format='png')
def sort_samples(samples_array, sorter):
    prot = feature_extractor.protocol_attr()
    for sample in samples_array:
        if sample.sampler_type not in sorted_samples:
            sorted_samples[sample.sampler_type] = {}
        brand_prod = sample.brand + "_" + sample.product
        if brand_prod not in sorted_samples[sample.sampler_type]:
            sorted_samples[sample.sampler_type][brand_prod] = {}
        for channel in sample.values.columns[1:]:
            # Skip channels that recorded nothing.
            if (sample.values[channel] == 0).all():
                continue
            card_channel = sample.card + "_" + channel
            if card_channel not in sorted_samples[sample.sampler_type][brand_prod]:
                sorted_samples[sample.sampler_type][brand_prod][card_channel] = []
            ch_data = channel_data()
            ch_data.sample_id = sample.ID
            ch_data.note = sample.note
            ch_data.tags = sample.tags
            # Copy so the baseline shift below does not mutate sample.values
            # (and does not trigger pandas' SettingWithCopyWarning).
            ch_data.values = sample.values[["time", channel]].copy()
            # Re-baseline against sample 30, then smooth and differentiate.
            ch_data.values[channel] -= ch_data.values[channel][30]
            ch_data.values[channel] = signal_process.smooth(ch_data.values[channel])
            ch_data.derviate_1 = signal_process.get_derivative_1(ch_data.values[channel])
            ch_data.derviate_2 = signal_process.get_derivative_2(ch_data.values[channel])
            ch_data.picks_list = feature_extractor.get_picks_indexes(
                ch_data, 0, ch_data.values.size)
            ch_data.protocol = prot
            feature_extractor.extract_features(ch_data, prot)
            sorted_samples[sample.sampler_type][brand_prod][card_channel].append(ch_data)
    datestr = constants.get_date_str()
    features_results_dir = constants.path_result_dir + datestr + constants.path_features_dir
    features_file_name = features_results_dir + "features_" + datestr + ".csv"
    if not os.path.exists(features_results_dir):
        os.makedirs(features_results_dir)
    feature_extractor.flush_features_data_frame(features_file_name, sorter)
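
# A self-contained sketch of the per-channel preprocessing chain above. The
# moving average and np.gradient stand-ins are assumptions; the real
# smooth/get_derivative_* implementations live in signal_process.
#
#     import numpy as np
#     raw = np.random.rand(600)
#     shifted = raw - raw[30]                        # re-baseline at sample 30
#     kernel = np.ones(5) / 5
#     smoothed = np.convolve(shifted, kernel, mode="same")
#     d1 = np.gradient(smoothed)                     # 1st derivative
#     d2 = np.gradient(d1)                           # 2nd derivative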
def create_mean_graphs(chan_array, sorter):
    tags_dict = {}
    first_ch_data = chan_array[0]
    card = sample_file_parser.get_sample_card(first_ch_data.sample_id)
    if sorter == sample_file_parser.tags_col_name:
        product = sample_file_parser.get_sample_prod(first_ch_data.sample_id)
    amount_of_samples = len(chan_array)
    channel = first_ch_data.values.columns[1]
    # Collect raw, 1st- and 2nd-derivative lines per tag.
    for ch_data in chan_array:
        tagstr = sample_file_parser.get_sample_tag(ch_data.sample_id)
        if sorter == sample_file_parser.product_col_name:
            product = sample_file_parser.get_sample_prod(ch_data.sample_id)
            tags = tagstr + "_" + product
        else:
            tags = tagstr
        if tags not in tags_dict:
            tags_dict[tags] = {}
            tags_dict[tags][similarity.raw_data_key] = []
            tags_dict[tags][similarity.first_derivative_key] = []
            tags_dict[tags][similarity.second_derivative_key] = []
        line = np.array(ch_data.values[ch_data.values.columns[1]])
        tags_dict[tags][similarity.raw_data_key].append(line.copy())
        line = np.array(ch_data.derviate_1)
        tags_dict[tags][similarity.first_derivative_key].append(line.copy())
        line = np.array(ch_data.derviate_2)
        tags_dict[tags][similarity.second_derivative_key].append(line.copy())
    # Element-wise mean of every line collected under each tag.
    mean_lines = {}
    for tags in tags_dict:
        mean_lines[tags] = {}
        for key in tags_dict[tags]:
            line_list = list(tags_dict[tags][key])
            mean_lines[tags][key] = np.sum(line_list, axis=0) / len(line_list)
    figW = 18
    figH = 10
    fig, ax = plt.subplots(3, 1, figsize=(figW, figH))
    time_array = get_time_array()
    for tags in mean_lines:
        i = 0
        for data_type in mean_lines[tags]:
            line = mean_lines[tags][data_type]
            ax[i].plot(time_array[0:509], line[0:509], label=tags)
            ax[i].legend(loc='upper left', bbox_to_anchor=(-0.2, 1.3))
            i += 1
    ax[0].set_title(similarity.raw_data_key)
    ax[1].set_title(similarity.first_derivative_key)
    ax[2].set_title(similarity.second_derivative_key)
    plt.subplots_adjust(hspace=0.3, left=0.18, bottom=1 / figH, top=1 - 1 / figH)
    if sorter == sample_file_parser.tags_col_name:
        suptitle = ("card: " + card + ", channel: " + channel +
                    ", Product: " + product + "\n" +
                    "amount of samples = " + str(amount_of_samples))
    else:
        suptitle = ("card: " + card + ", channel: " + channel + "\n" +
                    "amount of samples = " + str(amount_of_samples))
    fig.suptitle(suptitle, y=1)
    # Note: on matplotlib >= 3.6 this is fig.canvas.manager.set_window_title().
    fig.canvas.set_window_title("Average graphs")
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    out_dir_path = graphs_results_dir + constants.path_average_graphs_dir
    if not os.path.exists(out_dir_path):
        os.makedirs(out_dir_path)
    if sorter == sample_file_parser.tags_col_name:
        card_chan_prod = card + "_" + channel + "_" + product
    else:
        card_chan_prod = card + "_" + channel
    out_file_name = out_dir_path + "average_graph_" + card_chan_prod + "_" + datestr + ".png"
    plt.savefig(out_file_name)
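
# The summed-then-divided mean above is equivalent to a stacked np.mean; a
# tiny sketch (pure numpy, no project dependencies):
#
#     import numpy as np
#     lines = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
#     mean_line = np.mean(np.stack(lines), axis=0)   # -> array([2., 3.])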
def create_sim_plots(groups_dict, chan):
    global time_array, markers_list, markers_list_index, prod_tag_markers
    global card, channel
    global line_color, color_name_index
    figW = 18
    figH = 10
    fig, ax = plt.subplots(3, 1, figsize=(figW, figH))
    color_name_index = 0
    prod_tag_markers.clear()
    markers_list_index = 0
    card = ""
    channel = ""
    group_statistics = "\n"
    keys_list = [
        similarity.raw_data_key,
        similarity.first_derivative_key,
        similarity.second_derivative_key
    ]
    sub_plot_titles_pre_list = [
        "Sample relative data", "1st derivative", "2nd derivative"
    ]
    xpositions = []
    subplot_index = 0
    open_sim_out_file(chan)
    for key in keys_list:
        group_statistics = "\n"
        color_name_index = 0
        for group in groups_dict[key]:
            add_group_data(groups_dict[key][group], key)
            color_name_index += 1
            # Skip colors that are too light to read against the background.
            while names[color_name_index] in light_colors:
                color_name_index += 1
            group_statistics += (group + " members->" +
                                 str(groups_dict[key][group].members_count) + "\n")
            for ch_data in groups_dict[key][group].chan_data_array:
                if len(xpositions) == 0:
                    xpositions = feature_extractor.calculate_prot_timing(ch_data.protocol)
                add_graph_line(ax, ch_data, key)
                add_row(key, group, ch_data)
        ax[subplot_index].set_title(sub_plot_titles_pre_list[subplot_index])
        subplot_index += 1
    close_sim_out_file()
    # Mark protocol phase boundaries and the zero line on every subplot.
    for i in range(3):
        for xp in xpositions:
            ax[i].axvline(x=xp / 10, color='r', linestyle='--')
        ax[i].axhline(y=0, color='green', linestyle='dotted')
    ax[0].legend(loc='upper left', bbox_to_anchor=(-0.2, 1.3))
    ax[1].legend(loc='upper left', bbox_to_anchor=(0.8, 1.8))
    ax[2].legend(loc='upper left', bbox_to_anchor=(0, 1.8))
    plt.subplots_adjust(hspace=0.3, left=0.18, bottom=1 / figH, top=1 - 1 / figH)
    fig.suptitle("card: " + card + ", channel: " + channel, y=1)
    plt.get_current_fig_manager().full_screen_toggle()
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    graph_file_name = "simgraph_" + card + "_" + channel + "_" + datestr + ".png"
    out_dir_path = graphs_results_dir + constants.path_sim_graphs_dir
    plt.savefig(out_dir_path + graph_file_name)
    plt.close(fig)
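
# The groups_dict consumed above is expected to look roughly like this (a
# sketch inferred from the loop; group objects come from
# similarity.group_by_similarity and carry at least members_count and
# chan_data_array):
#
#     groups_dict = {
#         similarity.raw_data_key:          {"group_0": group, "group_1": group},
#         similarity.first_derivative_key:  {...},
#         similarity.second_derivative_key: {...},
#     }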
sample.sort_samples(Samples, sample_file_parser.product_col_name)
graphs_creator.create_sim_graphs(graphs_creator.split_by_prod)
# Alternative grouping, kept for reference:
# graphs_creator.create_sim_graphs(graphs_creator.split_by_tag)

# Write the bad-samples report collected by the similarity pass.
datestr = constants.get_date_str()
out_dir_path = constants.path_result_dir + datestr
out_file_name = out_dir_path + "/badsamples_report_" + datestr + ".csv"
out_file = open(out_file_name, "w")
out_file.write("sample id,data type,chan card,tags,refcount\n")
for sample_key in sorted(similarity.bad_sample_dict):
    bad_samp = similarity.bad_sample_dict[sample_key]
    line = (str(bad_samp.sample_id) + "," + bad_samp.key + "," +
            bad_samp.chan_card + "," + bad_samp.tags + "," +
            str(bad_samp.refcount) + "\n")
    out_file.write(line)
    print(line)
out_file.close()
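
# If tags can contain commas, hand-built CSV lines will break the column
# layout; a sketch of the same report via the csv module, which handles
# quoting automatically:
#
#     import csv
#     with open(out_file_name, "w", newline="") as f:
#         writer = csv.writer(f)
#         writer.writerow(["sample id", "data type", "chan card", "tags", "refcount"])
#         for sample_key in sorted(similarity.bad_sample_dict):
#             b = similarity.bad_sample_dict[sample_key]
#             writer.writerow([b.sample_id, b.key, b.chan_card, b.tags, b.refcount])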
def runRandomForest(dataframe):
    featureList = []
    tagsColumnIndex = dataframe.columns.get_loc(sample_file_parser.tags_col_name)
    channels_list = dataframe[constants.channel_out_col_title].unique()
    features_list = dataframe.columns[(tagsColumnIndex + 1):]
    # One flattened column per (channel, feature) pair.
    for channel in channels_list:
        for feature in features_list:
            featureList.append(channel + "_" + feature)
    randomForestDF = pd.DataFrame(
        columns=[sample_file_parser.product_col_name] +
                [sample_file_parser.tags_col_name] + featureList)
    # Flatten each product's per-channel rows into one wide row per sample.
    for product in dataframe[sample_file_parser.product_col_name].unique():
        rows_array = dataframe.loc[
            dataframe[sample_file_parser.product_col_name] == product]
        out_rows = []
        for channel in rows_array[constants.channel_out_col_title].unique():
            channel_rows = rows_array.loc[
                rows_array[constants.channel_out_col_title] == channel]
            i = 0
            for index, row in channel_rows.iterrows():
                row_flat = row.values.tolist()
                if len(out_rows) < (i + 1):
                    out_rows.append(row_flat[2:])
                elif len(out_rows[i]) == 0:
                    out_rows[i] += row_flat[2:]
                else:
                    out_rows[i] += row_flat[4:]
                i += 1
        j = 0
        while j < i:
            randomForestDF.loc[len(randomForestDF)] = out_rows[j]
            j += 1
    # Random forest: predict the product from the flattened features.
    x = randomForestDF[featureList]
    # Alternative target: y = randomForestDF[sample_file_parser.tags_col_name]
    y = randomForestDF[sample_file_parser.product_col_name]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
    clf = RandomForestClassifier(n_estimators=1000)
    # Train the model on the training set, then evaluate on the held-out set.
    # (A stray predict() before fit() was removed; it would raise NotFittedError.)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Model accuracy: how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    feature_imp = pd.Series(clf.feature_importances_,
                            index=featureList).sort_values(ascending=False)
    top_feature_imp = feature_imp[0:35]
    y_pos = np.arange(len(top_feature_imp))
    plt.figure(figsize=(20, 10))
    bars = plt.bar(y_pos,
                   top_feature_imp,
                   align='center',
                   alpha=1,
                   color=np.random.rand(len(top_feature_imp), 3))
    # Print the importance score above each bar.
    for rect in bars:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0,
                 height,
                 '%.2f' % height,
                 ha='center',
                 va='bottom',
                 fontsize=6)
    plt.xticks(y_pos, top_feature_imp.index, rotation='vertical', fontsize=8)
    plt.yticks(np.arange(min(feature_imp), max(feature_imp), step=0.01), fontsize=6)
    plt.ylabel('Feature Importance Score')
    plt.xlabel('Features')
    plt.title("Visualizing Important Features")
    plt.subplots_adjust(bottom=0.4)
    datestr = constants.get_date_str()
    graphs_results_dir = constants.path_result_dir + datestr + constants.path_graphs_dir
    graph_file_name = "random_forest_" + datestr + ".png"
    out_dir_path = graphs_results_dir + constants.path_random_forest_graphs_dir
    if not os.path.exists(out_dir_path):
        os.makedirs(out_dir_path)
    plt.savefig(out_dir_path + graph_file_name)
    plt.close()
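
# A self-contained sketch of the fit/predict/accuracy flow above on toy data
# (sklearn's bundled iris set, no project dependencies):
#
#     from sklearn.datasets import load_iris
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.model_selection import train_test_split
#     from sklearn import metrics
#     X, y = load_iris(return_X_y=True)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
#     clf = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)
#     print("Accuracy:", metrics.accuracy_score(y_test, clf.predict(X_test)))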