Example #1
def cyber_object_miner(file_name):
    sents = readfile(file_name)
    cyber_objects_words = list()
    cyber_objects_arg1 = list()
    action_verbs = list()
    for sentence in sents:
        extraction_list = list()
        try:
            predict = configuration.model_AllenNLP_SRL.predict(sentence=sentence)
            if len(predict['verbs']) == 0:
                # print('AllenNLP can not extract: ', sentence)
                cyber_objects_words.append(helper.remove_stopwords(sentence))
            else:
                for extractions in predict['verbs']:
                    verb = extractions['verb']
                    # checking from the verb_list
                    # print(verb)
                    action_verbs.append(verb)
                    # if helper.is_dictionay_key(configuration.preprocessOntologies.verb_dict,
                    #                            configuration.stemmer.stem(verb)):
                    description = extractions['description']
                    # print('description:',description)
                    # # print(description)
                    single_extraction_dict = AllenNLP.process_single_AllenNLP_description(description)
                    extraction_list.append(single_extraction_dict)
                extraction_list = AllenNLP.analyze_whole_sentence(extraction_list)
                extraction_list = AllenNLP.delete_extra_args(extraction_list)
                # print(single_extraction_dict)
                for single_extraction_dict in extraction_list:
                    try:
                        cyber_objects_arg1.append(helper.remove_stopwords(single_extraction_dict['where']))
                    except KeyError:
                        # 'where' (ARG1) was not present in this extraction
                        pass
        except Exception:
            # skip sentences that the SRL model fails to process
            pass

    return cyber_objects_arg1, cyber_objects_words, action_verbs
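
# Hedged sketch of how the SRL predictor referenced above as
# configuration.model_AllenNLP_SRL could be set up; the model archive URL is an
# assumption (check the current AllenNLP model listing), and the allennlp and
# allennlp-models packages must be installed.
from allennlp.predictors.predictor import Predictor

srl_predictor = Predictor.from_path(
    'https://storage.googleapis.com/allennlp-public-models/'
    'structured-prediction-srl-bert.2020.12.15.tar.gz')
prediction = srl_predictor.predict(sentence='The malware encrypts user files.')
for frame in prediction['verbs']:
    # Each frame carries the predicate and a bracketed argument description,
    # matching the 'verb' / 'description' keys used in cyber_object_miner.
    print(frame['verb'], '->', frame['description'])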
Example #2
def main():
    input_size = 10000
    embedding_size = 24
    output_size = 5
    learning_rate = 0.01
    oov_token = '<OOV>'
    loss = 'sparse_categorical_crossentropy'
    optimizer = Adam(learning_rate=learning_rate)
    epochs = 1
    train_val_split = 0.2

    sentences, sentiments = helper.get_data('data/train.tsv')
    sentences = helper.remove_stopwords(sentences, 'data/stopwords')

    max_length = len(max(sentences, key=len))

    tokenizer = helper.get_tokenizer(input_size, oov_token, sentences)

    padded_sentences = helper.convert_to_sequences(tokenizer, sentences,
                                                   max_length)

    train_padded_sentences, validation_padded_sentences, train_sentiments, validation_sentiments = \
        train_test_split(
            padded_sentences, sentiments, test_size=train_val_split, random_state=42
        )

    train_padded_sentences = np.array(train_padded_sentences)
    train_sentiments = np.array(train_sentiments)
    validation_padded_sentences = np.array(validation_padded_sentences)
    validation_sentiments = np.array(validation_sentiments)

    layers = [
        tf.keras.layers.Embedding(input_size,
                                  embedding_size,
                                  input_length=max_length),
        # tf.keras.layers.LSTM(32),

        # tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
        # tf.keras.layers.MaxPooling1D(pool_size=4),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(units=24, activation='relu'),
        tf.keras.layers.Dense(units=output_size, activation='softmax')
    ]

    model = MODEL(input_size, output_size, layers, loss, optimizer, epochs)
    model.__train__(train_padded_sentences, train_sentiments,
                    validation_padded_sentences, validation_sentiments)
    model.__plot_graph__('accuracy')
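
# Hedged sketch of the tokenizer helpers assumed above (helper.get_tokenizer and
# helper.convert_to_sequences are not shown in this excerpt); one plausible
# implementation using the Keras preprocessing utilities:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_tokenizer(num_words, oov_token, sentences):
    # Fit a vocabulary of at most num_words tokens on the training sentences.
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(sentences)
    return tokenizer

def convert_to_sequences(tokenizer, sentences, max_length):
    # Convert texts to integer sequences and pad/truncate them to max_length.
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=max_length,
                         padding='post', truncating='post')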
Example #3
def combine_cyber_object(files=cyber_object_files):
    text = ''
    for file in files:
        text += helper.read_file(file) + '\n'
    text = text.split('\n')
    dict_cyber = dict()
    for tt in text:
        tt = tt.translate(str.maketrans('', '', '!"#$%&\\()*+,.:;<=>?@[\\]^_`{|}~')).lower()
        key = helper.stem_and_lemmatize(tt, isRemoveStopword=True)
        dict_cyber[key] = helper.remove_stopwords(tt)
    what_list = list(set(list(dict_cyber.values())))
    what_list.sort()
    # for __ in what_list:
    #     print(__)
    # print(len(what_list))
    helper.write_file(output_file, what_list)
    return
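
# Hedged sketch of helper.stem_and_lemmatize as used above; the real helper is
# not shown here. Assumes NLTK's PorterStemmer and WordNetLemmatizer (matching
# the configuration.stemmer / configuration.lemmatizer references elsewhere).
import nltk
from nltk.corpus import stopwords

stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def stem_and_lemmatize(text, isRemoveStopword=False):
    # Lemmatize then stem each token, optionally dropping stopwords first.
    words = text.lower().split()
    if isRemoveStopword:
        words = [word for word in words if word not in stop_words]
    return ' '.join(stemmer.stem(lemmatizer.lemmatize(word)) for word in words)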
Example #4
def read_action_what(file_name='action_what.txt'):
    text = helper.read_file(file_name)
    text = text.split('\n')
    dict_cyber = dict()
    for tt in text:
        tt = tt.translate(
            str.maketrans('', '', '!"#$%&\\()*+,.:;<=>?@[\\]^_`{|}~')).lower()

        #     print(dict__)
        #     print('tt:',tt)
        #     if tt == '':
        #         continue
        #     stemmed = configuration.stemmer.stem(configuration.lemmatizer.lemmatize(tt)).lower()
        #     print(stemmed)
        #     if stemmed in dict__.keys():
        #         dict__[stemmed] = dict__[stemmed] + ' ' + tt
        #     else:
        #         dict__[stemmed] = tt + ' '
        #     what_list.append(configuration.stemmer.stem(configuration.lemmatizer.lemmatize(tt)).lower())
        key = helper.stem_and_lemmatize(tt, isRemoveStopword=True)
        dict_cyber[key] = helper.remove_stopwords(tt)
    what_list = list(set(list(dict_cyber.values())))
    what_list.sort()
    for __ in what_list:
        print(__)
    print(len(what_list))
    helper.write_file('ontology/temp.txt', what_list)
    # with open('ontology/all_cyber_list_3.txt','w', encoding='utf-8') as f:
    #     keys = list(dict_cyber.values())
    #     keys.sort()
    #     for tt in keys:
    #         # f.write(tt + ' - ' + dict__[tt] + '\n')
    #         f.write(tt + '\n')
    #         print(tt)
    #     # print(configuration.stemmer.stem(tt))
    return what_list
Example #5
def analyze_single_extraction(extracted_dictionary):
    if len(extracted_dictionary) <= 1:
        return False, create_attack_vector()
    args_list = list(extracted_dictionary.keys())
    result_list = list()
    # # print('dict_list', dict_list)
    compact_attack_vector = create_attack_vector()

    # check for negative sentence
    isAttributePresent = all(elem in args_list for elem in ['ARGM-NEG'])
    if isAttributePresent:
        compact_attack_vector['what'] = 'ARGM-NEG'
        return False, compact_attack_vector

    # Without subject, remove
    # if 'ARG0' not in list(args_list):
    #     return False, create_attack_vector()

    # Subject tracking
    if 'ARG0' in args_list and extracted_dictionary['ARG0'].strip() == 'we':
        # print('Skipped because no ARG0 extracted')
        return False, compact_attack_vector

    # Check for V and ARG1
    isAttributePresent = all(elem in args_list for elem in ['V', 'ARG1'])
    if isAttributePresent:
        # Check for preposition in ARG1, cyber_object
        compact_attack_vector['what'] = helper.remove_stopwords(
            extracted_dictionary['V'])
        if ('itself' in extracted_dictionary['ARG1'].split()
                or 'it' in extracted_dictionary['ARG1'].split()
            ) and 'ARG0' in args_list:
            compact_attack_vector['where'] = helper.remove_stopwords(
                extracted_dictionary['ARG0'])
        else:
            compact_attack_vector['where'] = helper.remove_stopwords(
                extracted_dictionary['ARG1'])

    # Check for purpose (ARGM-PRP). Note: isWhyFound stays True while no 'why'
    # has been filled in yet, so the ARG2 fallback below only fires when
    # ARGM-PRP was absent.
    isAttributePresent = all(elem in args_list for elem in ['ARGM-PRP'])
    isWhyFound = True
    if isAttributePresent:
        compact_attack_vector['why'] = helper.remove_stopwords(
            extracted_dictionary['ARGM-PRP'])
        isWhyFound = False

    # Need to add propbank extraction
    isAttributePresent = all(elem in args_list for elem in ['ARG2'])
    if isAttributePresent and isWhyFound:
        if 'to' in extracted_dictionary['ARG2'].split():
            compact_attack_vector['why'] = helper.remove_stopwords(
                extracted_dictionary['ARG2'])

    # Check for manner --> how
    isAttributePresent = all(elem in args_list for elem in ['ARGM-MNR'])
    if isAttributePresent:
        compact_attack_vector['how'] = helper.remove_stopwords(
            extracted_dictionary['ARGM-MNR'])

    isAttributePresent = all(elem in args_list for elem in ['ARGM-TMP'])
    if isAttributePresent:
        compact_attack_vector['when'] = extracted_dictionary['ARGM-TMP']
        # compact_attack_vector['when'] = helper.remove_stopwords(extracted_dictionary['ARGM-TMP'])
        # if len(compact_attack_vector['when']) == 0:
        #     return False, compact_attack_vector

    return True, compact_attack_vector
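
# Hedged sketch of the create_attack_vector() factory used above; its real
# definition is not shown in this excerpt. It is assumed to return an empty
# 5W1H-style dictionary with the keys that analyze_single_extraction fills in.
def create_attack_vector():
    return {
        'what': '',   # action / verb (V)
        'where': '',  # object or subject it applies to (ARG1 / ARG0)
        'why': '',    # purpose (ARGM-PRP, or ARG2 containing 'to')
        'how': '',    # manner (ARGM-MNR)
        'when': '',   # temporal modifier (ARGM-TMP)
    }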
Example #6
    def get_important_relations(self, dep_tree, sentence):
        extracted_words = dict()
        what_bagofwords = set()
        where_bagofwords = set()
        where_attribute_bagofwords = set()
        how_bagofwords = set()
        why_bagofwords = set()
        when_bagofwords = set()
        subject_bagofwords = set()
        action_bagofwords = set()

        for node in dep_tree[0]:
            #            print(node)
            self.get_relation(node, 'dobj', what_bagofwords, where_bagofwords)
            # if node[0] == 'dobj':
            #   action_bagofwords.add(verb+" "+obj)

            self.get_relation(node, 'nsubj', what_bagofwords,
                              subject_bagofwords)

            self.get_relation(node, 'nmod:on', what_bagofwords,
                              where_attribute_bagofwords)

            self.get_relation(node, 'nmod:in', where_attribute_bagofwords,
                              where_attribute_bagofwords)

            self.get_relation(node, 'advcl:to', what_bagofwords,
                              why_bagofwords)

            # relations whose endpoints both feed the "where" bag of words
            for relation in ('compound', 'nsubjpass', 'nmod:agent', 'nmod:from',
                             'nmod:to', 'nmod:with', 'nmod:via', 'nmod:over',
                             'nmod:for', 'nmod:through', 'nmod:using',
                             'nmod:into'):
                self.get_relation(node, relation, where_bagofwords,
                                  where_bagofwords)

        #        what_bafofwords.append(verb)
        #        where_bagofwords.append(obj)
        extracted_words['what'] = helper.remove_stopwords(what_bagofwords)
        extracted_words['where'] = helper.remove_stopwords(where_bagofwords)
        extracted_words['where_attribute'] = helper.remove_stopwords(
            where_attribute_bagofwords)
        extracted_words['why'] = helper.remove_stopwords(why_bagofwords)
        extracted_words['when'] = helper.remove_stopwords(when_bagofwords)
        extracted_words['how'] = helper.remove_stopwords(how_bagofwords)
        extracted_words['subject'] = helper.remove_stopwords(
            subject_bagofwords)
        extracted_words['action'] = helper.remove_stopwords(action_bagofwords)
        extracted_words['text'] = sentence

        return extracted_words
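
    # Hedged sketch of the get_relation helper used above; its real definition
    # is not part of this excerpt. Assumes each node is a
    # (relation, governor_word, dependent_word) triple from the dependency
    # parse, with the governor and dependent collected into the two bags.
    def get_relation(self, node, relation, governor_bag, dependent_bag):
        rel, governor, dependent = node
        if rel == relation:
            governor_bag.add(governor)
            dependent_bag.add(dependent)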
Example #7
#START DATA PREPROCESSING

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(' '))

data['punct%'] = data['body_text'].apply(lambda x: count_punctuation(x))

data['body_text_clean'] = data['body_text'].apply(
    lambda x: remove_punctuation(x))

data['body_text_tokenized'] = data['body_text_clean'].apply(
    lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words('english')

data['body_text_nonstop'] = data['body_text_tokenized'].apply(
    lambda x: remove_stopwords(x, stopwords))

stemmer = nltk.PorterStemmer()

data['body_text_stemmed'] = data['body_text_nonstop'].apply(
    lambda x: stemming(x, stemmer))

wn = nltk.WordNetLemmatizer()

data['body_text_lemmatized'] = data['body_text_nonstop'].apply(
    lambda x: lemmatizing(x, wn))

#END DATA PREPROCESSING
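
# Hedged sketch of the preprocessing helpers assumed above (count_punctuation,
# remove_punctuation, tokenize, remove_stopwords, stemming and lemmatizing are
# not shown in this excerpt); one plausible implementation:
import re
import string

def count_punctuation(text):
    # Punctuation characters as a percentage of non-space characters.
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / max(len(text) - text.count(' '), 1) * 100, 3)

def remove_punctuation(text):
    return ''.join(char for char in text if char not in string.punctuation)

def tokenize(text):
    return re.split(r'\W+', text.lower())

def remove_stopwords(tokens, stopwords):
    return [word for word in tokens if word not in stopwords]

def stemming(tokens, stemmer):
    return [stemmer.stem(word) for word in tokens]

def lemmatizing(tokens, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in tokens]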

#BEGINNING VECTORIZATION OF DATA
Example #8
data = documents.copy()
# Removes phrases with @ in them (e.g. email addresses and handles)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Truncates multiple consecutive whitespace characters to one space
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Removes ' characters
data = [re.sub(r"'", "", sent) for sent in data]

data_words = list(he.sent_to_words(data))
print('Building Bigrams')
# Making Bigrams - Higher the threshold, fewer the phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
print('Removing Stopwords')
# Remove Stop Words
data_words_nostops = he.remove_stopwords(data_words, stop_words)
print('Forming Bigrams')
# Form Bigrams
data_words_bigrams = he.make_bigrams(data_words_nostops, bigram_mod)
print('Lemmatizing Data')
# Lemmatize Data
data_lemmatized = he.lemmatization(
    data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# The keep_n parameter controls the size of the vocabulary.
# At this stage, we have to manually experiment with various vocabulary sizes to see what works best.
# I found that ~8-10% of the number of documents is a good size.
# For Digital India, I used vocab size of 1000 (12412 documents).
# For GST, I used a vocab size of 1500 (15k documents approx)

print('Creating Dictionary')
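
# Hedged sketch of how the dictionary/corpus step might continue, following the
# keep_n note above; the no_below/no_above/keep_n values here are illustrative
# assumptions, not the author's settings.
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_lemmatized)
# Drop very rare and very frequent tokens, then cap the vocabulary with keep_n.
id2word.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
corpus = [id2word.doc2bow(text) for text in data_lemmatized]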