def update_dict(self, weighted_dict, sentance, y_SC, y_HW, y_SW):
    words = tokenize(sentance)
    # loop through the new words, adjusting their weights using the labels
    for word in words:
        # extract the weight entry (a dict of per-class counts)
        weight = weighted_dict.get(word)
        # if this is a new word, start from a zeroed entry
        if weight is None:
            weight = {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0}
        # add the new labels to the previous totals
        # note: the labels are taken from the GUI checkbox values, which are 0 or 1
        weight['SC'] += y_SC
        weight['HW'] += y_HW
        weight['SW'] += y_SW
        weight['TOT'] += 1
        # store the updated weight back in the weighted dictionary
        weighted_dict[word] = weight
    return weighted_dict
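# Hedged usage sketch (illustrative, not part of the original module): shows
# the entry shape update_dict builds, with a plain split() standing in for
# this module's tokenize(), whose exact behavior is an assumption here.
def _demo_update_dict_shape():
    wd = {}
    for word in 'software anomaly corrupted the image'.split():
        entry = wd.setdefault(word, {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0})
        # labels as they would arrive from the GUI checkboxes: SC=1, HW=0, SW=1
        entry['SC'] += 1
        entry['HW'] += 0
        entry['SW'] += 1
        entry['TOT'] += 1
    assert wd['anomaly'] == {'SC': 1, 'HW': 0, 'SW': 1, 'TOT': 1}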
def get_weighted_dict(self, data_labeled):
    # new weighted_dict with the dataset-level metadata counters zeroed
    weighted_dict = {
        'N_SC': 0, 'N_HW': 0, 'N_SW': 0,
        'N_SCHW': 0, 'N_SCSW': 0, 'N_HWSW': 0, 'N_SCHWSW': 0,
        'N_OTHER': 0, 'N_TOT': 0,
    }
    # loop through the labeled dataset and recount the frequencies
    for _, data in data_labeled.iterrows():
        # metadata calculations
        weighted_dict['N_TOT'] += 1
        weighted_dict['N_SC'] += data.loc['SC']
        weighted_dict['N_HW'] += data.loc['HW']
        weighted_dict['N_SW'] += data.loc['SW']
        if data.loc['SC'] == 1 and data.loc['HW'] == 1:
            weighted_dict['N_SCHW'] += 1
        if data.loc['SC'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_SCSW'] += 1
        if data.loc['HW'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_HWSW'] += 1
        if data.loc['SC'] == 1 and data.loc['HW'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_SCHWSW'] += 1
        if data.loc['SC'] == 0 and data.loc['HW'] == 0 and data.loc['SW'] == 0:
            weighted_dict['N_OTHER'] += 1
        # per-word weight calculations
        words = tokenize(data.loc['MANUFACTURER_RECALL_REASON'])
        for word in words:
            if word not in weighted_dict:
                weighted_dict[word] = {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0}
            weighted_dict[word]['SC'] += data.loc['SC']
            weighted_dict[word]['HW'] += data.loc['HW']
            weighted_dict[word]['SW'] += data.loc['SW']
            weighted_dict[word]['TOT'] += 1
    return weighted_dict
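# Hedged sketch (illustrative, not in the original): the dictionary returned
# above mixes dataset-level metadata keys (N_SC, N_TOT, ...) with per-word
# count entries, so a class-conditional frequency for a word, as consumed by
# weight_sentance below, is simply count / total:
def _demo_word_frequency(weighted_dict, word):
    freqs = weighted_dict.get(word)
    if freqs is None:
        return 0.0
    # fraction of the sentences containing `word` that carried the SC label
    return float(freqs['SC']) / freqs['TOT']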
def clf_sentance(self, sentance):
    # tokenize the sentence
    words = tokenize(sentance)
    # initialize the classifications
    classifications = {'SC': 0, 'HW': 0, 'SW': 0}
    # for each class, scan its keyword list and flag a classification
    # on a keyword match
    # NOTE: each keyword list is a list of tuples; position 0 holds the word
    for word in words:
        for kw in self.keywords_SC:
            if word == kw[0]:
                classifications['SC'] = 1
        for kw in self.keywords_HW:
            if word == kw[0]:
                classifications['HW'] = 1
        for kw in self.keywords_SW:
            if word == kw[0]:
                classifications['SW'] = 1
    return classifications
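# Hedged sketch (illustrative): clf_sentance assumes self.keywords_SC/HW/SW
# are lists of tuples whose first element is the keyword itself. A standalone
# equivalent of the matching logic, with hypothetical keywords and weights:
def _demo_keyword_match():
    keywords_SC = [('breach', 0.9), ('corrupted', 0.7)]  # hypothetical values
    words = 'data was corrupted during transfer'.split()
    flag_SC = int(any(word == kw[0] for kw in keywords_SC for word in words))
    assert flag_SC == 1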
def weight_sentance(self, sentance, weighted_dict):
    words_unclean = tokenize(sentance)
    # filter each class's stopwords
    words_SC = [word for word in words_unclean if word not in self.my_stopwords_SC]
    words_HW = [word for word in words_unclean if word not in self.my_stopwords_HW]
    words_SW = [word for word in words_unclean if word not in self.my_stopwords_SW]
    # sum the weights
    sample_weight = {'SC': 0.0, 'HW': 0.0, 'SW': 0.0}
    # used to normalize the weights
    norm_SC = 0.0
    norm_HW = 0.0
    norm_SW = 0.0
    # security weightings
    for word in words_SC:
        # look up the frequencies in the table; skip words that are not in it
        freqs = weighted_dict.get(word)
        if freqs is not None:
            # TODO: not sure if referencing SW is helping
            if freqs['SW']:
                sample_weight['SC'] += float(freqs['SC']) / freqs['SW']
            else:
                sample_weight['SC'] += float(freqs['SC']) / freqs['TOT']
            norm_SC += 1
    # hardware weightings
    for word in words_HW:
        freqs = weighted_dict.get(word)
        if freqs is not None:
            sample_weight['HW'] += float(freqs['HW']) / freqs['TOT']
            norm_HW += 1
    # software weightings
    for word in words_SW:
        freqs = weighted_dict.get(word)
        if freqs is not None:
            sample_weight['SW'] += float(freqs['SW']) / freqs['TOT']
            norm_SW += 1
    # normalize by the number of matched words
    if norm_SC:
        sample_weight['SC'] /= norm_SC
    if norm_HW:
        sample_weight['HW'] /= norm_HW
    if norm_SW:
        sample_weight['SW'] /= norm_SW
    # map through a sigmoid
    if self.alpha != 0:
        sample_weight['SC'] = sigmoid(sample_weight['SC'], alpha=self.alpha)
        sample_weight['HW'] = sigmoid(sample_weight['HW'], alpha=self.alpha)
        sample_weight['SW'] = sigmoid(sample_weight['SW'], alpha=self.alpha)
    return sample_weight
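# weight_sentance relies on tokenize() and sigmoid() defined elsewhere in the
# module. As a point of reference, a plausible minimal sketch of both follows;
# the alpha-scaled logistic form is an assumption, not the module's actual
# definition:
import math
import re

def _sketch_sigmoid(x, alpha=1.0):
    # squashes the averaged weight into (0, 1); larger alpha = steeper curve
    return 1.0 / (1.0 + math.exp(-alpha * x))

def _sketch_tokenize(sentence):
    # lowercase the text and keep alphanumeric tokens only
    return re.findall(r'[a-z0-9]+', sentence.lower())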
def term_freq_analysis_narrow(data_labeled_from_model_addr='data/recall_labeled_from_model.csv',
                              data_labeled_addr='data/recall_labeled.csv'):
    software_kwoi = [
        set(['anomaly']),
        set(['image', 'imaging']),
        set(['interface', 'gui']),
        set(['version', 'v']),
        set(['protocol', 'message'])
    ]
    hardware_kwoi = [
        set(['defective', 'damaged']),
        set(['battery', 'power', 'charge', 'energy', 'voltage', 'charging', 'charger']),
        set(['board', 'circuit', 'capacitor', 'wiring', 'pcb']),
        set(['alarm']),
        set(['monitor', 'display'])
    ]
    security_kwoi = [
        set(['error']),
        set(['sent', 'transfer', 'received']),
        set(['deleted', 'corrupted']),
        set(['anomaly']),
        set(['data', 'file', 'information', 'disk', 'archive', 'record'])
    ]
    # read both labeled sample sets (manual and model-labeled) and combine them
    df_man = pd.read_csv(data_labeled_addr)
    df_auto = pd.read_csv(data_labeled_from_model_addr)
    data = pd.concat([df_man, df_auto], ignore_index=True)
    # determine the number of years; it should be 17, from 2002 to 2018
    years = data['YEAR'].drop_duplicates()
    numYears = years.shape[0]
    # separate histogram for each class: one counter per keyword-of-interest
    # set, stored in a dict keyed by year
    hist_by_year_sc = {}
    hist_by_year_sw = {}
    hist_by_year_hw = {}
    for year in years:
        hist_by_year_sc[year] = [0, 0, 0, 0, 0]
        hist_by_year_sw[year] = [0, 0, 0, 0, 0]
        hist_by_year_hw[year] = [0, 0, 0, 0, 0]
    # go through each sample and update the histograms by year and class
    for i, row in data.iterrows():
        # read the row
        year = row['YEAR']
        sentence = row['MANUFACTURER_RECALL_REASON']
        label_SC = row['SC']
        label_SW = row['SW']
        label_HW = row['HW']
        wordSet = set(tokenize(sentence))
        # for each labeled class, increment the counter of every
        # keyword-of-interest set with a non-empty overlap with the words
        if label_SC == 1:
            hist = hist_by_year_sc[year]
            for j, kwoi_j in enumerate(security_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
        if label_SW == 1:
            hist = hist_by_year_sw[year]
            for j, kwoi_j in enumerate(software_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
        if label_HW == 1:
            hist = hist_by_year_hw[year]
            for j, kwoi_j in enumerate(hardware_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
    # dataframes to be filled for each class; these are the outputs of this function
    obj = {}
    for j, kwoi_j in enumerate(security_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_sc[year][j] for year in years]
    df_sc = pd.DataFrame(obj, index=years)
    obj = {}
    for j, kwoi_j in enumerate(software_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_sw[year][j] for year in years]
    df_sw = pd.DataFrame(obj, index=years)
    obj = {}
    for j, kwoi_j in enumerate(hardware_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_hw[year][j] for year in years]
    df_hw = pd.DataFrame(obj, index=years)
    df_sc.to_csv('data/analysis/sc_termofinterest_freq.csv')
    df_sw.to_csv('data/analysis/sw_termofinterest_freq.csv')
    df_hw.to_csv('data/analysis/hw_termofinterest_freq.csv')
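# Hedged sketch (illustrative): the core of the narrow analysis above is a
# set-overlap test between a sample's tokens and each keyword-of-interest set;
# any non-empty intersection counts as one hit for that set in that year.
def _demo_kwoi_overlap():
    kwoi = set(['battery', 'power', 'charge'])
    word_set = set('the battery failed to hold a charge'.split())
    assert len(word_set.intersection(kwoi)) > 0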
def term_freq_analysis(data_labeled_from_model_addr='data/recall_labeled_from_model.csv',
                       data_labeled_addr='data/recall_labeled.csv'):
    software_kwoi = [
        ['anomaly'],
        ['image', 'imaging'],
        ['interface', 'gui'],
        ['version', 'v'],
        ['protocol', 'message']
    ]
    hardware_kwoi = [
        ['defective', 'damaged'],
        ['battery', 'power', 'charge', 'energy', 'voltage', 'charging', 'charger'],
        ['board', 'circuit', 'capacitor', 'wiring', 'pcb'],
        ['alarm'],
        ['monitor', 'display']
    ]
    security_kwoi = [
        ['error'],
        ['sent', 'transfer', 'received'],
        ['deleted', 'corrupted'],
        ['anomaly'],
        ['data', 'file', 'information', 'disk', 'archive', 'record']
    ]
    # read both labeled sample sets (manual and model-labeled) and combine them
    df_man = pd.read_csv(data_labeled_addr)
    df_auto = pd.read_csv(data_labeled_from_model_addr)
    data = pd.concat([df_man, df_auto], ignore_index=True)
    # determine the number of years; it should be 17, from 2002 to 2018
    years = data['YEAR'].drop_duplicates()
    numYears = years.shape[0]
    # separate histograms for each class, determining the most popular words
    # globally and for each year (stored in dicts with a year key)
    hist_global_sc = {}
    hist_global_sw = {}
    hist_global_hw = {}
    hist_by_year_sc = {}
    hist_by_year_sw = {}
    hist_by_year_hw = {}
    for year in years:
        hist_by_year_sc[year] = {}
        hist_by_year_sw[year] = {}
        hist_by_year_hw[year] = {}
    # go through each sample and update the histograms by year and class
    for i, row in data.iterrows():
        # read the row
        year = row['YEAR']
        sentence = row['MANUFACTURER_RECALL_REASON']
        label_SC = row['SC']
        label_SW = row['SW']
        label_HW = row['HW']
        words = tokenize(sentence)
        # for each labeled class, update the per-year and global histograms
        if label_SC == 1:
            hist = hist_by_year_sc[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_sc[word] = hist_global_sc.get(word, 0) + 1
        if label_SW == 1:
            hist = hist_by_year_sw[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_sw[word] = hist_global_sw.get(word, 0) + 1
        if label_HW == 1:
            hist = hist_by_year_hw[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_hw[word] = hist_global_hw.get(word, 0) + 1
    # dataframes to be filled for each class; these are the outputs of this function
    df_sc = pd.DataFrame({'words': ['' for year in years]}, index=years)
    df_sw = pd.DataFrame({'words': ['' for year in years]}, index=years)
    df_hw = pd.DataFrame({'words': ['' for year in years]}, index=years)
    # for each year determine the top 10 keywords
    top_keywords_by_year_sc = {}
    top_keywords_by_year_sw = {}
    top_keywords_by_year_hw = {}
    for year in years:
        # initialize the top 10
        top_keywords_sc = [('', 0) for i in range(10)]
        top_keywords_sw = [('', 0) for i in range(10)]
        top_keywords_hw = [('', 0) for i in range(10)]
        # grab the histograms
        hist_sc = hist_by_year_sc[year]
        hist_sw = hist_by_year_sw[year]
        hist_hw = hist_by_year_hw[year]
        # insertion-sort each word into the top-10 list: when its count beats
        # an entry, shift the lower entries back, insert it there, then break
        # sort SC
        for word, count in hist_sc.items():
            for i, (keyword, keycount) in enumerate(top_keywords_sc):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_sc[j] = top_keywords_sc[j - 1]
                    top_keywords_sc[i] = (word, count)
                    break
        # sort SW
        for word, count in hist_sw.items():
            for i, (keyword, keycount) in enumerate(top_keywords_sw):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_sw[j] = top_keywords_sw[j - 1]
                    top_keywords_sw[i] = (word, count)
                    break
        # sort HW
        for word, count in hist_hw.items():
            for i, (keyword, keycount) in enumerate(top_keywords_hw):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_hw[j] = top_keywords_hw[j - 1]
                    top_keywords_hw[i] = (word, count)
                    break
        # store in the per-year datastructures
        top_keywords_by_year_sc[year] = top_keywords_sc
        top_keywords_by_year_sw[year] = top_keywords_sw
        top_keywords_by_year_hw[year] = top_keywords_hw
        df_sc.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_sc])
        df_sw.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_sw])
        df_hw.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_hw])
    df_sc.to_csv('data/analysis/sc_term_freq.csv')
    df_sw.to_csv('data/analysis/sw_term_freq.csv')
    df_hw.to_csv('data/analysis/hw_term_freq.csv')
    # create and sort lists of keywords with their global frequencies
    top_keywords_global_sc = list(hist_global_sc.items())
    top_keywords_global_sw = list(hist_global_sw.items())
    top_keywords_global_hw = list(hist_global_hw.items())
    top_keywords_global_sc.sort(key=lambda x: x[1], reverse=True)
    top_keywords_global_sw.sort(key=lambda x: x[1], reverse=True)
    top_keywords_global_hw.sort(key=lambda x: x[1], reverse=True)
    # write them out to local csv files
    strOut_sc = ''
    for (word, count) in top_keywords_global_sc:
        strOut_sc += word + ', ' + str(count) + '\n'
    with open('data/analysis/sc_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_sc)
    strOut_sw = ''
    for (word, count) in top_keywords_global_sw:
        strOut_sw += word + ', ' + str(count) + '\n'
    with open('data/analysis/sw_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_sw)
    strOut_hw = ''
    for (word, count) in top_keywords_global_hw:
        strOut_hw += word + ', ' + str(count) + '\n'
    with open('data/analysis/hw_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_hw)
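if __name__ == '__main__':
    # Hedged usage sketch: both analyses read the two labeled CSVs and write
    # their outputs under data/analysis/, which must already exist. They also
    # assume `import pandas as pd` and a tokenize() helper at module level.
    term_freq_analysis()          # top-10 terms per year plus global keyword histograms
    term_freq_analysis_narrow()   # per-year hit counts for the fixed keyword-of-interest sets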