def update_dict(self, weighted_dict, sentance, y_SC, y_HW, y_SW):
    words = tokenize(sentance)
    # loop through the new words, adjusting their weights using the labels
    for word in words:
        # extract the weight entry (a dict of per-class counts)
        weight = weighted_dict.get(word)
        # if this is a new word, start from a zeroed entry
        if weight is None:
            weight = {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0}
        # add the new labels to the previous totals
        # note: the labels are taken from the GUI checkbox values, which are 0 or 1
        weight['SC'] += y_SC
        weight['HW'] += y_HW
        weight['SW'] += y_SW
        weight['TOT'] += 1
        # store the updated weight back in the weighted dictionary
        weighted_dict[word] = weight
    return weighted_dict
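# Hedged usage sketch (illustrative, not part of the original module): shows
# the entry shape update_dict builds, with a plain split() standing in for
# this module's tokenize(), whose exact behavior is an assumption here.
def _demo_update_dict_shape():
    wd = {}
    for word in 'software anomaly corrupted the image'.split():
        entry = wd.setdefault(word, {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0})
        # labels as they would arrive from the GUI checkboxes: SC=1, HW=0, SW=1
        entry['SC'] += 1
        entry['HW'] += 0
        entry['SW'] += 1
        entry['TOT'] += 1
    assert wd['anomaly'] == {'SC': 1, 'HW': 0, 'SW': 1, 'TOT': 1}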
def get_weighted_dict(self, data_labeled):
    # new weighted_dict with the dataset-level metadata counters zeroed
    weighted_dict = {
        'N_SC': 0, 'N_HW': 0, 'N_SW': 0,
        'N_SCHW': 0, 'N_SCSW': 0, 'N_HWSW': 0, 'N_SCHWSW': 0,
        'N_OTHER': 0, 'N_TOT': 0,
    }
    # loop through the labeled dataset and recount the frequencies
    for _, data in data_labeled.iterrows():
        # metadata calculations
        weighted_dict['N_TOT'] += 1
        weighted_dict['N_SC'] += data.loc['SC']
        weighted_dict['N_HW'] += data.loc['HW']
        weighted_dict['N_SW'] += data.loc['SW']
        if data.loc['SC'] == 1 and data.loc['HW'] == 1:
            weighted_dict['N_SCHW'] += 1
        if data.loc['SC'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_SCSW'] += 1
        if data.loc['HW'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_HWSW'] += 1
        if data.loc['SC'] == 1 and data.loc['HW'] == 1 and data.loc['SW'] == 1:
            weighted_dict['N_SCHWSW'] += 1
        if data.loc['SC'] == 0 and data.loc['HW'] == 0 and data.loc['SW'] == 0:
            weighted_dict['N_OTHER'] += 1
        # per-word weight calculations
        words = tokenize(data.loc['MANUFACTURER_RECALL_REASON'])
        for word in words:
            if word not in weighted_dict:
                weighted_dict[word] = {'SC': 0, 'HW': 0, 'SW': 0, 'TOT': 0}
            weighted_dict[word]['SC'] += data.loc['SC']
            weighted_dict[word]['HW'] += data.loc['HW']
            weighted_dict[word]['SW'] += data.loc['SW']
            weighted_dict[word]['TOT'] += 1
    return weighted_dict
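# Hedged sketch (illustrative, not in the original): the dictionary returned
# above mixes dataset-level metadata keys (N_SC, N_TOT, ...) with per-word
# count entries, so a class-conditional frequency for a word, as consumed by
# weight_sentance below, is simply count / total:
def _demo_word_frequency(weighted_dict, word):
    freqs = weighted_dict.get(word)
    if freqs is None:
        return 0.0
    # fraction of the sentences containing `word` that carried the SC label
    return float(freqs['SC']) / freqs['TOT']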
def clf_sentance(self, sentance):
    # tokenize the sentence
    words = tokenize(sentance)
    # initialize the classifications
    classifications = {'SC': 0, 'HW': 0, 'SW': 0}
    # for each class, scan its keyword list and flag a classification
    # on a keyword match
    # NOTE: each keyword list is a list of tuples; position 0 holds the word
    for word in words:
        for kw in self.keywords_SC:
            if word == kw[0]:
                classifications['SC'] = 1
        for kw in self.keywords_HW:
            if word == kw[0]:
                classifications['HW'] = 1
        for kw in self.keywords_SW:
            if word == kw[0]:
                classifications['SW'] = 1
    return classifications
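# Hedged sketch (illustrative): clf_sentance assumes self.keywords_SC/HW/SW
# are lists of tuples whose first element is the keyword itself. A standalone
# equivalent of the matching logic, with hypothetical keywords and weights:
def _demo_keyword_match():
    keywords_SC = [('breach', 0.9), ('corrupted', 0.7)]  # hypothetical values
    words = 'data was corrupted during transfer'.split()
    flag_SC = int(any(word == kw[0] for kw in keywords_SC for word in words))
    assert flag_SC == 1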
def weight_sentance(self, sentance, weighted_dict):
    words_unclean = tokenize(sentance)
    # filter each class's stopwords
    words_SC = [word for word in words_unclean if word not in self.my_stopwords_SC]
    words_HW = [word for word in words_unclean if word not in self.my_stopwords_HW]
    words_SW = [word for word in words_unclean if word not in self.my_stopwords_SW]
    # sum the weights
    sample_weight = {'SC': 0.0, 'HW': 0.0, 'SW': 0.0}
    # used to normalize the weights
    norm_SC = 0.0
    norm_HW = 0.0
    norm_SW = 0.0
    # security weightings
    for word in words_SC:
        # look up the frequencies in the table; skip words that are not in it
        freqs = weighted_dict.get(word)
        if freqs is not None:
            # TODO: not sure if referencing SW is helping
            if freqs['SW']:
                sample_weight['SC'] += float(freqs['SC']) / freqs['SW']
            else:
                sample_weight['SC'] += float(freqs['SC']) / freqs['TOT']
            norm_SC += 1
    # hardware weightings
    for word in words_HW:
        freqs = weighted_dict.get(word)
        if freqs is not None:
            sample_weight['HW'] += float(freqs['HW']) / freqs['TOT']
            norm_HW += 1
    # software weightings
    for word in words_SW:
        freqs = weighted_dict.get(word)
        if freqs is not None:
            sample_weight['SW'] += float(freqs['SW']) / freqs['TOT']
            norm_SW += 1
    # normalize by the number of matched words
    if norm_SC:
        sample_weight['SC'] /= norm_SC
    if norm_HW:
        sample_weight['HW'] /= norm_HW
    if norm_SW:
        sample_weight['SW'] /= norm_SW
    # map through a sigmoid
    if self.alpha != 0:
        sample_weight['SC'] = sigmoid(sample_weight['SC'], alpha=self.alpha)
        sample_weight['HW'] = sigmoid(sample_weight['HW'], alpha=self.alpha)
        sample_weight['SW'] = sigmoid(sample_weight['SW'], alpha=self.alpha)
    return sample_weight
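# weight_sentance relies on tokenize() and sigmoid() defined elsewhere in the
# module. As a point of reference, a plausible minimal sketch of both follows;
# the alpha-scaled logistic form is an assumption, not the module's actual
# definition:
import math
import re

def _sketch_sigmoid(x, alpha=1.0):
    # squashes the averaged weight into (0, 1); larger alpha = steeper curve
    return 1.0 / (1.0 + math.exp(-alpha * x))

def _sketch_tokenize(sentence):
    # lowercase the text and keep alphanumeric tokens only
    return re.findall(r'[a-z0-9]+', sentence.lower())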
def term_freq_analysis_narrow(data_labeled_from_model_addr='data/recall_labeled_from_model.csv',
                              data_labeled_addr='data/recall_labeled.csv'):
    software_kwoi = [
        set(['anomaly']),
        set(['image', 'imaging']),
        set(['interface', 'gui']),
        set(['version', 'v']),
        set(['protocol', 'message'])
    ]
    hardware_kwoi = [
        set(['defective', 'damaged']),
        set(['battery', 'power', 'charge', 'energy', 'voltage', 'charging', 'charger']),
        set(['board', 'circuit', 'capacitor', 'wiring', 'pcb']),
        set(['alarm']),
        set(['monitor', 'display'])
    ]
    security_kwoi = [
        set(['error']),
        set(['sent', 'transfer', 'received']),
        set(['deleted', 'corrupted']),
        set(['anomaly']),
        set(['data', 'file', 'information', 'disk', 'archive', 'record'])
    ]
    # read both labeled sample sets (manual and model-labeled) and combine them
    df_man = pd.read_csv(data_labeled_addr)
    df_auto = pd.read_csv(data_labeled_from_model_addr)
    data = pd.concat([df_man, df_auto], ignore_index=True)
    # determine the number of years; it should be 17, from 2002 to 2018
    years = data['YEAR'].drop_duplicates()
    numYears = years.shape[0]
    # separate histogram for each class: one counter per keyword-of-interest
    # set, stored in a dict keyed by year
    hist_by_year_sc = {}
    hist_by_year_sw = {}
    hist_by_year_hw = {}
    for year in years:
        hist_by_year_sc[year] = [0, 0, 0, 0, 0]
        hist_by_year_sw[year] = [0, 0, 0, 0, 0]
        hist_by_year_hw[year] = [0, 0, 0, 0, 0]
    # go through each sample and update the histograms by year and class
    for i, row in data.iterrows():
        # read the row
        year = row['YEAR']
        sentence = row['MANUFACTURER_RECALL_REASON']
        label_SC = row['SC']
        label_SW = row['SW']
        label_HW = row['HW']
        wordSet = set(tokenize(sentence))
        # for each labeled class, increment the counter of every
        # keyword-of-interest set with a non-empty overlap with the words
        if label_SC == 1:
            hist = hist_by_year_sc[year]
            for j, kwoi_j in enumerate(security_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
        if label_SW == 1:
            hist = hist_by_year_sw[year]
            for j, kwoi_j in enumerate(software_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
        if label_HW == 1:
            hist = hist_by_year_hw[year]
            for j, kwoi_j in enumerate(hardware_kwoi):
                if len(wordSet.intersection(kwoi_j)) > 0:
                    hist[j] += 1
    # dataframes to be filled for each class; these are the outputs of this function
    obj = {}
    for j, kwoi_j in enumerate(security_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_sc[year][j] for year in years]
    df_sc = pd.DataFrame(obj, index=years)
    obj = {}
    for j, kwoi_j in enumerate(software_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_sw[year][j] for year in years]
    df_sw = pd.DataFrame(obj, index=years)
    obj = {}
    for j, kwoi_j in enumerate(hardware_kwoi):
        wordStr = ",".join(kwoi_j)
        obj[wordStr] = [hist_by_year_hw[year][j] for year in years]
    df_hw = pd.DataFrame(obj, index=years)
    df_sc.to_csv('data/analysis/sc_termofinterest_freq.csv')
    df_sw.to_csv('data/analysis/sw_termofinterest_freq.csv')
    df_hw.to_csv('data/analysis/hw_termofinterest_freq.csv')
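# Hedged sketch (illustrative): the core of the narrow analysis above is a
# set-overlap test between a sample's tokens and each keyword-of-interest set;
# any non-empty intersection counts as one hit for that set in that year.
def _demo_kwoi_overlap():
    kwoi = set(['battery', 'power', 'charge'])
    word_set = set('the battery failed to hold a charge'.split())
    assert len(word_set.intersection(kwoi)) > 0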
def term_freq_analysis(data_labeled_from_model_addr='data/recall_labeled_from_model.csv',
                       data_labeled_addr='data/recall_labeled.csv'):
    software_kwoi = [
        ['anomaly'],
        ['image', 'imaging'],
        ['interface', 'gui'],
        ['version', 'v'],
        ['protocol', 'message']
    ]
    hardware_kwoi = [
        ['defective', 'damaged'],
        ['battery', 'power', 'charge', 'energy', 'voltage', 'charging', 'charger'],
        ['board', 'circuit', 'capacitor', 'wiring', 'pcb'],
        ['alarm'],
        ['monitor', 'display']
    ]
    security_kwoi = [
        ['error'],
        ['sent', 'transfer', 'received'],
        ['deleted', 'corrupted'],
        ['anomaly'],
        ['data', 'file', 'information', 'disk', 'archive', 'record']
    ]
    # read both labeled sample sets (manual and model-labeled) and combine them
    df_man = pd.read_csv(data_labeled_addr)
    df_auto = pd.read_csv(data_labeled_from_model_addr)
    data = pd.concat([df_man, df_auto], ignore_index=True)
    # determine the number of years; it should be 17, from 2002 to 2018
    years = data['YEAR'].drop_duplicates()
    numYears = years.shape[0]
    # separate histograms for each class, determining the most popular words
    # globally and for each year (stored in dicts with a year key)
    hist_global_sc = {}
    hist_global_sw = {}
    hist_global_hw = {}
    hist_by_year_sc = {}
    hist_by_year_sw = {}
    hist_by_year_hw = {}
    for year in years:
        hist_by_year_sc[year] = {}
        hist_by_year_sw[year] = {}
        hist_by_year_hw[year] = {}
    # go through each sample and update the histograms by year and class
    for i, row in data.iterrows():
        # read the row
        year = row['YEAR']
        sentence = row['MANUFACTURER_RECALL_REASON']
        label_SC = row['SC']
        label_SW = row['SW']
        label_HW = row['HW']
        words = tokenize(sentence)
        # for each labeled class, update the per-year and global histograms
        if label_SC == 1:
            hist = hist_by_year_sc[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_sc[word] = hist_global_sc.get(word, 0) + 1
        if label_SW == 1:
            hist = hist_by_year_sw[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_sw[word] = hist_global_sw.get(word, 0) + 1
        if label_HW == 1:
            hist = hist_by_year_hw[year]
            for word in words:
                hist[word] = hist.get(word, 0) + 1
                hist_global_hw[word] = hist_global_hw.get(word, 0) + 1
    # dataframes to be filled for each class; these are the outputs of this function
    df_sc = pd.DataFrame({'words': ['' for year in years]}, index=years)
    df_sw = pd.DataFrame({'words': ['' for year in years]}, index=years)
    df_hw = pd.DataFrame({'words': ['' for year in years]}, index=years)
    # for each year determine the top 10 keywords
    top_keywords_by_year_sc = {}
    top_keywords_by_year_sw = {}
    top_keywords_by_year_hw = {}
    for year in years:
        # initialize the top 10
        top_keywords_sc = [('', 0) for i in range(10)]
        top_keywords_sw = [('', 0) for i in range(10)]
        top_keywords_hw = [('', 0) for i in range(10)]
        # grab the histograms
        hist_sc = hist_by_year_sc[year]
        hist_sw = hist_by_year_sw[year]
        hist_hw = hist_by_year_hw[year]
        # insertion-sort each word into the top-10 list: when its count beats
        # an entry, shift the lower entries back, insert it there, then break
        # sort SC
        for word, count in hist_sc.items():
            for i, (keyword, keycount) in enumerate(top_keywords_sc):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_sc[j] = top_keywords_sc[j - 1]
                    top_keywords_sc[i] = (word, count)
                    break
        # sort SW
        for word, count in hist_sw.items():
            for i, (keyword, keycount) in enumerate(top_keywords_sw):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_sw[j] = top_keywords_sw[j - 1]
                    top_keywords_sw[i] = (word, count)
                    break
        # sort HW
        for word, count in hist_hw.items():
            for i, (keyword, keycount) in enumerate(top_keywords_hw):
                if count >= keycount:
                    for j in range(9, i, -1):
                        top_keywords_hw[j] = top_keywords_hw[j - 1]
                    top_keywords_hw[i] = (word, count)
                    break
        # store in the per-year datastructures
        top_keywords_by_year_sc[year] = top_keywords_sc
        top_keywords_by_year_sw[year] = top_keywords_sw
        top_keywords_by_year_hw[year] = top_keywords_hw
        df_sc.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_sc])
        df_sw.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_sw])
        df_hw.loc[year, 'words'] = ", ".join([word for (word, count) in top_keywords_hw])
    df_sc.to_csv('data/analysis/sc_term_freq.csv')
    df_sw.to_csv('data/analysis/sw_term_freq.csv')
    df_hw.to_csv('data/analysis/hw_term_freq.csv')
    # create and sort lists of keywords with their global frequencies
    top_keywords_global_sc = list(hist_global_sc.items())
    top_keywords_global_sw = list(hist_global_sw.items())
    top_keywords_global_hw = list(hist_global_hw.items())
    top_keywords_global_sc.sort(key=lambda x: x[1], reverse=True)
    top_keywords_global_sw.sort(key=lambda x: x[1], reverse=True)
    top_keywords_global_hw.sort(key=lambda x: x[1], reverse=True)
    # write them out to local csv files
    strOut_sc = ''
    for (word, count) in top_keywords_global_sc:
        strOut_sc += word + ', ' + str(count) + '\n'
    with open('data/analysis/sc_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_sc)
    strOut_sw = ''
    for (word, count) in top_keywords_global_sw:
        strOut_sw += word + ', ' + str(count) + '\n'
    with open('data/analysis/sw_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_sw)
    strOut_hw = ''
    for (word, count) in top_keywords_global_hw:
        strOut_hw += word + ', ' + str(count) + '\n'
    with open('data/analysis/hw_global_keyword_hist.csv', mode='w') as file:
        file.write(strOut_hw)
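if __name__ == '__main__':
    # Hedged usage sketch: both analyses read the two labeled CSVs and write
    # their outputs under data/analysis/, which must already exist. They also
    # assume `import pandas as pd` and a tokenize() helper at module level.
    term_freq_analysis()          # top-10 terms per year plus global keyword histograms
    term_freq_analysis_narrow()   # per-year hit counts for the fixed keyword-of-interest sets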