Example #1
def debate_text_process(text):
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(str(text))
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = set(stopwords.words('english'))

    STOP_WORDS.update(stop_words)
    STOP_WORDS.update({
        'nt', 'okay', 'ha', 'thank', 'wa', 'got', 'oh', 'said', 'going',
        'want', 'let', 'know'
    })
    words = [w for w in words if w not in STOP_WORDS]
    #print(len(STOP_WORDS))

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return words
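# A quick usage sketch for the function above (hypothetical input; assumes the
# NLTK punkt, stopwords and wordnet data have been downloaded via nltk.download()):
if __name__ == "__main__":
    sample = "We are going to talk about the economy, okay?"
    # expected: lowercased, lemmatized content words such as ['talk', 'economy']
    print(debate_text_process(sample))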
Example #2
 def load_custom_stoplist(self, stoplist_file):
     """Load custom stoplist."""
     with open(stoplist_file, 'r') as f:
         stoplist = f.read().split('\n')
     for item in stoplist:
         if not item:
             continue  # skip empty strings produced by trailing newlines
         STOP_WORDS.add(item)
         self.nlp.vocab[item].is_stop = True
Example #3
    def _collect_words(self):
        """Collects all the unique word and pos_tag pairs from the text."""
        nlp = spacy.load("en_core_web_lg")
        # coref = NeuralCoref(nlp.vocab)
        # nlp.add_pipe(coref, name='neuralcoref')

        print("Preparing Spacy object")
        nlp.max_length = len(self.text)
        text_obj = nlp(str(self.text.lower()), disable=['ner'])  # pipe names are lower-case

        print("Preparing Spacy object")
        # Resolve co-reference using neuralcoref
        # self.text = text_obj._.coref_resolved
        # nlp.remove_pipe("neuralcoref")
        # text_obj = nlp(str(self.text.lower()), disable=['NER'])

        prev_sent = Sentence(nlp(''), None)
        words = {}
        STOP_WORDS.add('_')
        logging.info("Collecting words")
        for sent in tqdm(text_obj.sents):
            # sent = nlp(Sentence.clean_sentence(sent.text))
            curr_sent = Sentence(sent, prev_sent)
            for token in sent:
                if token.text in STOP_WORDS or\
                        token.pos_ in ['PART', 'PUNCT', 'SPACE', 'NUM', 'SYM']:
                    continue
                key = token.text.strip() + ' ; ' + token.tag_
                if key not in words:
                    words[key] = Word(token)
                words[key].include_sentence(curr_sent)

        return words
Example #4
    def spacy_adder(self, model, verbose=False):

        for stopword in self.vocab_list:
            STOP_WORDS.add(stopword)

        model.vocab.add_flag(
            lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
            spacy.attrs.IS_STOP)
        if verbose:
            print(
                f"Complete. There are {len(self.vocab_list)} stop words in the list."
            )
Example #5
def construct_stop_words():
    """
    Update the spacy stopwords list
    :return:
    """
    stop_words_list = [
        "uk", "ceo", "apple", "wal", "st", "q1", "q2", "q3", "q4", "bp",
        "wednesday", "tuesday", "monday", "thursday", "friday", "sept",
        "johnson", "inc", "david", "amazon.com"
    ]

    for word in stop_words_list:
        STOP_WORDS.add(word)

    return STOP_WORDS
Example #6
def words_stop():
    words_stop._log.debug("\nThe outcomes of words stop are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    words_stop._log.debug(nlp.Defaults.stop_words)
def summarization():

  with open("./stories/d3370f0d60746aebcc5f61a068805b8545357e6f.story", "r", encoding="utf-8") as f:
    text = " ".join(f.readlines())
    core = en_core_web_sm.load()

  doc = core(text)
  # clean sentences
  corpus = [sent.text.lower() for sent in doc.sents]
  STOP_WORDS.add("@highlight")
  cv = CountVectorizer(stop_words=list(STOP_WORDS))
  cv_fit = cv.fit_transform(corpus)
  word_list = cv.get_feature_names()
  count_list = cv_fit.toarray().sum(axis=0)
  
  # zip words together with their counts
  word_frequency = dict(zip(word_list, count_list))
  words_freqs = sorted(word_frequency.values())
  higher_word_frequencies = [
      word for word, freq in word_frequency.items() if freq in words_freqs[-3:]
  ]
  print("higher frequency words : ", higher_word_frequencies)

  higher_frequency = words_freqs[-1]
  # normalise the frequency values
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word]/higher_frequency)

  sentence_rank = {}
  for sent in doc.sents:
    for word in sent:
      if word.text.lower() in word_frequency.keys():
        if sent in sentence_rank.keys():
          sentence_rank[sent] += word_frequency[word.text.lower()]
        else:
          sentence_rank[sent] = word_frequency[word.text.lower()]
      else:
        continue

  # fetch top sentences which have the higher top-freq words
  top_sentences = (sorted(sentence_rank.values())[::-1])
  top_sent = top_sentences[:3]

  summary = []
  for sent, strength in sentence_rank.items():
    if strength in top_sent:
      summary.append(sent)

  return text, summary
Example #8
    def clean_text(document):

        stop_words_ = STOP_WORDS.union(stopwords.words('english'))
        stop_words = [unidecode(stop).lower() for stop in stop_words_]
        # Split to translate
        tokens = document.split()
        # Concatenate
        document = ' '.join(tokens)
        # Remove accents
        document = unidecode(document)
        # Remove mentions, URLs and other special-character patterns
        document = re.sub(
            r"(@[A-Za-z0-9]+)|(_[A-Za-z0-9]+)|(\w+:\/\/\S+)|(\W_)", " ",
            document).lower()
        # Remove punctuation
        document = re.sub('[' + string.punctuation + ']', '', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Remove digits
        document = ''.join([i for i in document if not i.isdigit()])
        # Remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Split
        tokens = document.split()
        # Stopwords
        tokens = [w for w in tokens if w not in stop_words]
        # Concatenate
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text
Example #9
def Stop():
    print("\nThe outcomes of Stop Words are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)
    print(nlp.Defaults.stop_words)
Example #10
    def _set_stopwords(self) -> 'KeywordRanking':
        stop_words = STOP_WORDS.union(self.stopwords) if self.stopwords else STOP_WORDS

        for word in stop_words:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        return self
Example #11
def cluster_commonwords(texts, nwords=10, onlycorona="yes"):
    ignore_words = STOP_WORDS if onlycorona == "no" else STOP_WORDS.union(
        ['coronavirus', 'covid', 'covid19', 'covid-19'])
    allwords = [
        w for w in ' '.join(texts.str.lower()).split()
        if w not in ignore_words and re.search('[a-z]', w)
    ]
    return ', '.join(
        [word for word, cnt in Counter(allwords).most_common(nwords)])
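# A quick usage sketch (hypothetical data; assumes pandas as pd, re, Counter and
# STOP_WORDS are imported as in the snippet above):
if __name__ == "__main__":
    texts = pd.Series(["Covid cases rise in the city",
                       "City hospitals report covid cases"])
    # with the default onlycorona="yes" the covid terms are filtered out,
    # leaving something like "cases, city, rise"
    print(cluster_commonwords(texts, nwords=3))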
Example #12
 def set_stopwords(self, stopwords):
     """Set stop words"""
     if self.language == "en":
         for word in STOP_WORDS.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
     elif self.language == "de":
         for word in STOP_WORDS_DE.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
def transform_matrix(content_body, tokenize_lemma):
    html_stop_words = get_html_stop_words(content_body, tokenize_lemma)
    stop_words_lemma_train = set(
        tokenize_lemma(' '.join(STOP_WORDS.union(set(html_stop_words)))))
    X = content_body
    tfidf_vectorizer = TfidfVectorizer(max_features=300,
                                       stop_words=stop_words_lemma_train,
                                       tokenizer=tokenize_lemma)
    tfidf_vectorizer = tfidf_vectorizer.fit(X)
    tfidf_matrix = tfidf_vectorizer.transform(X)
    return tfidf_matrix
Example #14
def remove_stopwords(content):
    custom_stopwords = ("feeling", "feel", "becaus", "want", "time", "realli",
                        "im", "think", "thing", "ive", "still", "littl", "one",
                        "life", "peopl", "need", "bit", "even", "much", "dont",
                        "look", "way", "love", "start", "s", "m", "quot",
                        "work", "get", "http", "go", "day", "com", "got", "see"
                        "4pm", "<BIAS>", "veri", "know", "t", "like", "someth",
                        "good", "going", "today", "u", "new", "cant", "people",
                        "little", "pretty", "things")
    return hero.remove_stopwords(content,
                                 spacy_stop_words.union(custom_stopwords))
def preprocess_text(author_df):
    nlp = spacy.load('en')
    STOP_WORDS.add("'s")
    STOP_WORDS.add('the')
    STOP_WORDS.add('a')
    for word in STOP_WORDS:
        nlp.vocab[word].is_stop = True
    doc = author_df.text.apply(nlp)

    # remove stop words and punctuations
    clean_and_lemmatize = lambda x: ' '.join([t.lemma_ for t in x if not t.is_punct and not t.is_stop])
    author_df['text_cleaned'] = doc.apply(clean_and_lemmatize)

    # entities
    author_df['text_with_entities'] = doc.apply(replace_ents)

    # pos-tag pairs
    author_df['text_pos_tag_pairs'] = author_df['text'].apply(lambda row: pos_tag_pairs_sentence(row))

    # additional nlp meta features
    author_df['polarity_of_text'] = author_df['text'].apply(lambda row: get_polarity(row))
    author_df['punct_cnt'] = doc.apply(lambda x: len([t for t in x if t.is_punct]))
    author_df['words_cnt'] = doc.apply(lambda x: len([t for t in x if not t.is_punct]))
    author_df['ents_cnt'] = doc.apply(lambda x: len(x.ents))
    author_df['noun_chunks_cnt'] = doc.apply(lambda x: len(list(x.noun_chunks)))
    author_df['fraction_noun'] = author_df['text'].apply(lambda row: fraction_noun(row))
    author_df['fraction_adj'] = author_df['text'].apply(lambda row: fraction_adj(row))
    author_df['fraction_verbs'] = author_df['text'].apply(lambda row: fraction_verbs(row))

    return author_df
def preprocess(texts):

    texts = str(texts)
    texts = texts.lower()
    texts = re.sub(r"(http|@)\S+", " ", texts)
    texts = demojize(texts)
    texts = re.sub(r"’", "'", texts)
    texts = re.sub("n't", "n not", texts)
    texts = re.sub("'ll", " will", texts)
    texts = re.sub("'ve", " have", texts)
    texts = re.sub(r"[^a-z\':_]", " ", texts)
    texts = re.sub(r"[0-9]+", " ", texts)
    texts = re.sub("re-[a-z]+", " ", texts)
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = re.sub(pattern, r"\1", texts)

    tokens = tokenizer(texts)
    try:
        STOP_WORDS.remove('not')
        STOP_WORDS.remove('nor')
        STOP_WORDS.remove('no')
    except KeyError:
        # the negation words may already have been removed on an earlier call
        pass

    lemma_list = []
    for token in tokens:
        # compare the token's text, not the Token object, against the stop list
        if token.text not in STOP_WORDS:
            lemma_list.append(token.lemma_)
    texts = ' '.join(map(str, lemma_list))
    pred_vect = vectorizer.transform([texts])
    texts = label.classes_[model.predict(pred_vect)]
    texts = ' '.join(map(str, texts))

    return texts
Example #17
def scripts_to_tfidf(scripts):
    """Create Tfidf matrix from tokenized scripts."""
    # custom stop words for scripts
    film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
    stop_words = STOP_WORDS.union(film_stop_words)

    # vectorize scripts into Tfidf matrix
    vectorizer = TfidfVectorizer(input='content', stop_words=stop_words, min_df=0.2,
                                 ngram_range=(1, 2))  # less than 20% frequency words are removed.
    bow = vectorizer.fit_transform(scripts)
    vocab = vectorizer.get_feature_names()

    return bow, vocab
    def detectTextIn(self, Text):
        classFromText = []
        classFromText.append(Text)
        # Text=Text.lower()
        nlp = spacy.load('en_core_web_sm')
        # Adding Custom stop words
        STOP_WORDS.add("picture")
        STOP_WORDS.add("image")
        STOP_WORDS.add("images")
        STOP_WORDS.add("pics")
        STOP_WORDS.add("portrait")
        for word in STOP_WORDS:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        uni_string = str(Text)
        doc = nlp(uni_string)
        for ent in doc.ents:
            classFromText.append(ent.label_)

        Text = Text.lower()
        uni_string = str(Text)
        doc = nlp(uni_string)

        for token in doc:
            # """token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            #       token.shape_, token.is_alpha, token.is_stop"""
            if not token.is_stop:
                classFromText.append(token.lemma_)
                classFromText.append(token.text)

        classFromText = [a.lower() for a in classFromText]
        # drop empty strings (removing items while iterating over a list skips elements)
        classFromText = [text for text in classFromText if text != ""]
        classFromText = set(classFromText)
        return classFromText
Example #19
def stage2(process_folder, label):

    path_stage1 = process_folder + label + 'stage1.json'

    from spacy.lang.en.stop_words import STOP_WORDS

    path_stage2 = process_folder + label + 'stage2.json'

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(
                path_stage1, ranks, stopwords=STOP_WORDS):
            f.write("%s\n" % pretty_print(rl._asdict()))
Example #20
def prepare_stopwords():
    NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere","no",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

    stopwords = STOP_WORDS.copy()
    for word in STOP_WORDS:
        if word in NEGATE:
            stopwords.remove(word)

    return stopwords
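# A shorter, behaviour-equivalent sketch of the same idea: the negation terms are
# subtracted from spaCy's stop list with a set difference instead of an explicit
# loop (prepare_stopwords_diff is a hypothetical name, not part of the original):
def prepare_stopwords_diff(negate_words):
    """Return a copy of spaCy's STOP_WORDS without the given negation terms."""
    return STOP_WORDS.difference(negate_words)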
def get_baseline():
    print('Spacy:', len(STOP_WORDS))
    sw_sk = set(stop_words.ENGLISH_STOP_WORDS)
    print('sklearn', len(sw_sk))

    sw = set(
        pd.read_csv(
            'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
            header=None,
            squeeze=True).tolist())

    print('web', len(sw))

    all_words = STOP_WORDS.union(sw).union(sw_sk)  # avoid shadowing the builtin all()
    print('all', len(all_words))
    pd.Series(sorted(list(all_words))).to_csv('baseline.csv', index=False)
Example #22
import numpy as np
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus
# from IPython.display import Image
from spellchecker import SpellChecker
import pickle

# using SMOTE to deal with class imbalance
#from imblearn.over_sampling import SMOTE
# these symbols seem to carry extra weight in the final words when Naive Bayes is used,
# so they are added to the punctuation list to be filtered out
punctuations = string.punctuation + "".join(
    ["...", "..........", "....", "--", "/"])
nlp = spacy.load("en_core_web_sm")
STOP_WORDS = STOP_WORDS.union(CUSTOM_STOP_WORDS)

#excluding NO from stopwords for our use
#STOP_WORDS.discard("no")
#STOP_WORDS.discard("not")
#STOP_WORDS.discard("off")

Urban_vocab = pd.read_csv("urbandict-word-def.csv")
Urban_vocab = Urban_vocab["WORD"].tolist()
# contraction_log = open("1_contractions.log", "w")
# slang_log = open("1_slang.log", "w")
out_vocab = open("1_o_vocab.log", "w")
corpus_vocab = open("1_corpus_vocab.log", "w")

parser = English()
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.SMILEY, p.OPT.NUMBER,
Example #23
 def set_stopwords(self, stopwords):
     """Set stop words"""
     for word in STOP_WORDS.union(set(stopwords)):
         lexeme = nlp.vocab[word]
         lexeme.is_stop = True
Example #24
File: aux.py  Project: MarcosFP97/eXtream
def queryTokens(cadena, languages):
    cadena = __preprocessString(cadena)
    # remove non ascii-characters
    cadena = ''.join(i for i in cadena if ord(i) < 128)
    cadena = cadena.strip()  # remove initial and end spaces
    word_tokens = word_tokenize(cadena)
    # Detect in which language the text is written
    lang = detect_language(word_tokens, languages)
    stop_words = set(stopwords.words(lang))  # Filtering stop words
    inverters = set([
        'dont', 'doesnt', 'havent', 'arent', 'didnt', 'wasnt', 'werent', 'not',
        'never', 'hardly', 'seldom'
    ])
    incrementers = set(['too', 'many', 'much', 'very', 'lots'])
    STOP_WORDS.update({
        'im', 'pm', 'ai', 'ie', 'still', 'cant', 'isnt', 'couldnt', 'youre',
        'seen', 'say', 'says', 'tell', 'lot', 'lol', 'hes', 's', 'be'
    })
    filtered_sentence = [
        w for w in word_tokens if w not in stop_words and w not in inverters
        and w not in incrementers and w not in STOP_WORDS
    ]  # Checking not in stop_words
    return filtered_sentence
Example #25
import os
import re
from unidecode import unidecode
import numpy as np
import json
import sys
import logging
from numpy.linalg import norm
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('de_l_la_le_di')
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)-5.5s]  %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()


class MemoryGenerator():
    def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path):
        logger.info("Initializing Memory Generator ....")
        self.conv2kg = conv2kg
        self.kgs = kgs
        self.mapping = json.load(open("data/" + dataset + "/ERmapping.json"))
        self.maxEntity, self.maxRel = self.read_dataset(dataset)
        logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " +
                    str(self.maxRel))
        self.matrix_dim = self.maxEntity + self.maxRel
        self.word_emb = load_facebook_model(
            datapath(os.getcwd() + "/" + fasttext_emb_path))
    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_tabe_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_tabe_name: (str) Name of the table where the document information is stored.
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (str) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_tabe_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            self.first_distinct_documents = list(open_pc.cursor.fetchall())
            # extract from the tuple structure
            self.first_distinct_documents = [
                el[0] for el in self.first_distinct_documents
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = \
                self.first_distinct_documents[:self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse:
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve.keys():
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(
            [value for value in self.sentence_fields.values()])

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        for word in custom_stopwords:
            self.stopwords.add(word)

        self.logger.info("Successfully initialized TermGenerator.")
import hashlib
from pytorch_pretrained_bert import BertTokenizer
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
STOP_WORDS.update(string.punctuation)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_tokenization_length(context, question, reference, candidate):
    context_len = len(tokenizer.tokenize(context))
    question_len = len(tokenizer.tokenize(question))
    candidate_len = len(tokenizer.tokenize(candidate))
    reference_len = len(tokenizer.tokenize(reference))

    return max(context_len + question_len + candidate_len,
               context_len + question_len + reference_len)


def check_data_and_return_hash(context, question, reference, candidate):
    assert type(context) == type(question) == type(reference) == type(
        candidate) == str

    if context == '' or question == '' or reference == '' or candidate == '':
        return None

    sample = context + question + reference + candidate
    hash_object = hashlib.md5(sample.encode())
Example #28
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import os
import json
from pathlib import Path

nlp = spacy.load('en_core_web_lg')

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.readlines()
    STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
    STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings:
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',
}

documents = []  #  [ [token, token, token], [token, token, token], ...]

fp = '../data/LRECjson/'
for jsonfile in os.listdir(Path(fp)):
    #for jsonfile in ['../data/LRECjson/2018_1049.json']:
Windows: python -m spacy download en   (run as Administrator)

Linux: sudo python -m spacy download en
"""

nlp = spacy.load('en')

"""#Exploring spaCy"""

from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS

f'There are {len(STOP_WORDS)} stopwords in spaCy'

# You can add your own corpora specific STOPWORDS using the .add syntax
STOP_WORDS.add("your_additional_stop_word_here")
f'After adding your own stop words, spaCy will use {len(STOP_WORDS)} stopwords'

doc = nlp("I am learning the most important ideas Natural Language Processing ideas using Python")
print(doc)  # doc is a spaCy object which stores the entire document string

"""**About spaCy objects**"""

for token in doc:
    print(token)

simplified_doc = [token for token in doc if not token.is_punct | token.is_stop]
simplified_doc
# please note that .orth_ attribute returns the unicode string representation of the token

"""We can also check what other things we know about these tags in the simplified_doc:"""
Example #30
import pickle
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en', disable=['parser'])


CLASSIFIER_ROOT = 'classifiers/'
TRANSFORMERS = ['transform_bag_of_words_0.sav',
                'transform_bag_of_words_1.sav']
MODELS = ['nb.sav']


STOP_WORDS.add("'s")
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def load_model(model_name):
    with open('{0}{1}'.format(CLASSIFIER_ROOT, model_name), 'rb') as f:
        model = pickle.load(f)
    return model

CLF_NB = load_model(MODELS[0])
TRANSFORMERS_MODELS = [load_model(TRANSFORMERS[0]), load_model(TRANSFORMERS[1])]

def clean_html(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext.lower()