Example #1
def debate_text_process(text):
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(str(text))
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = set(stopwords.words('english'))

    STOP_WORDS.update(stop_words)
    STOP_WORDS.update({
        'nt', 'okay', 'ha', 'thank', 'wa', 'got', 'oh', 'said', 'going',
        'want', 'let', 'know'
    })
    words = [w for w in words if w not in STOP_WORDS]
    #print(len(STOP_WORDS))

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return words
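# A quick usage sketch for the function above (hypothetical input; assumes the
# NLTK punkt, stopwords and wordnet data have been downloaded via nltk.download()):
if __name__ == "__main__":
    sample = "We are going to talk about the economy, okay?"
    # expected: lowercased, lemmatized content words such as ['talk', 'economy']
    print(debate_text_process(sample))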
Example #2
 def load_custom_stoplist(self, stoplist_file):
     """Load custom stoplist."""
     with open(stoplist_file, 'r') as f:
         stoplist = f.read().split('\n')
     for item in stoplist:
         if not item:
             continue  # skip empty strings produced by trailing newlines
         STOP_WORDS.add(item)
         self.nlp.vocab[item].is_stop = True
Example #3
    def _collect_words(self):
        """Collects all the unique word and pos_tag pairs from the text."""
        nlp = spacy.load("en_core_web_lg")
        # coref = NeuralCoref(nlp.vocab)
        # nlp.add_pipe(coref, name='neuralcoref')

        print("Preparing Spacy object")
        nlp.max_length = len(self.text)
        text_obj = nlp(str(self.text.lower()), disable=['ner'])  # pipe names are lower-case

        print("Preparing Spacy object")
        # Resolve co-reference using neuralcoref
        # self.text = text_obj._.coref_resolved
        # nlp.remove_pipe("neuralcoref")
        # text_obj = nlp(str(self.text.lower()), disable=['NER'])

        prev_sent = Sentence(nlp(''), None)
        words = {}
        STOP_WORDS.add('_')
        logging.info("Collecting words")
        for sent in tqdm(text_obj.sents):
            # sent = nlp(Sentence.clean_sentence(sent.text))
            curr_sent = Sentence(sent, prev_sent)
            for token in sent:
                if token.text in STOP_WORDS or\
                        token.pos_ in ['PART', 'PUNCT', 'SPACE', 'NUM', 'SYM']:
                    continue
                key = token.text.strip() + ' ; ' + token.tag_
                if key not in words:
                    words[key] = Word(token)
                words[key].include_sentence(curr_sent)

        return words
Example #4
    def spacy_adder(self, model, verbose=False):

        for stopword in self.vocab_list:
            STOP_WORDS.add(stopword)

        model.vocab.add_flag(
            lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
            spacy.attrs.IS_STOP)
        if verbose:
            print(
                f"Complete. There are {len(self.vocab_list)} stop words in the list."
            )
Example #5
def construct_stop_words():
    """
    Update the spacy stopwords list
    :return:
    """
    stop_words_list = [
        "uk", "ceo", "apple", "wal", "st", "q1", "q2", "q3", "q4", "bp",
        "wednesday", "tuesday", "monday", "thursday", "friday", "sept",
        "johnson", "inc", "david", "amazon.com"
    ]

    for word in stop_words_list:
        STOP_WORDS.add(word)

    return STOP_WORDS
Example #6
def words_stop():
    words_stop._log.debug("\nThe outcomes of words stop are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    words_stop._log.debug(nlp.Defaults.stop_words)
def summarization():

  with open("./stories/d3370f0d60746aebcc5f61a068805b8545357e6f.story", "r", encoding="utf-8") as f:
    text = " ".join(f.readlines())
    core = en_core_web_sm.load()

  doc = core(text)
  # clean sentences
  corpus = [sent.text.lower() for sent in doc.sents]
  STOP_WORDS.add("@highlight")
  cv = CountVectorizer(stop_words=list(STOP_WORDS))
  cv_fit = cv.fit_transform(corpus)
  word_list = cv.get_feature_names()
  count_list = cv_fit.toarray().sum(axis=0)
  
  # zip words together with their counts
  word_frequency = dict(zip(word_list, count_list))
  words_freqs = sorted(word_frequency.values())
  higher_word_frequencies = [
      word for word, freq in word_frequency.items() if freq in words_freqs[-3:]
  ]
  print("higher frequency words : ", higher_word_frequencies)

  higher_frequency = words_freqs[-1]
  # normalise the frequency values
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word]/higher_frequency)

  sentence_rank = {}
  for sent in doc.sents:
    for word in sent:
      if word.text.lower() in word_frequency.keys():
        if sent in sentence_rank.keys():
          sentence_rank[sent] += word_frequency[word.text.lower()]
        else:
          sentence_rank[sent] = word_frequency[word.text.lower()]
      else:
        continue

  # fetch top sentences which have the higher top-freq words
  top_sentences = (sorted(sentence_rank.values())[::-1])
  top_sent = top_sentences[:3]

  summary = []
  for sent, strength in sentence_rank.items():
    if strength in top_sent:
      summary.append(sent)

  return text, summary
Example #8
    def clean_text(document):

        stop_words_ = STOP_WORDS.union(stopwords.words('english'))
        stop_words = [unidecode(stop).lower() for stop in stop_words_]
        # Split to translate
        tokens = document.split()
        # Concatenate
        document = ' '.join(tokens)
        # Remove accents
        document = unidecode(document)
        # Remove mentions, URLs and other special-character patterns
        document = re.sub(
            r"(@[A-Za-z0-9]+)|(_[A-Za-z0-9]+)|(\w+:\/\/\S+)|(\W_)", " ",
            document).lower()
        # Remove punctuation
        document = re.sub('[' + string.punctuation + ']', '', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Remove digits
        document = ''.join([i for i in document if not i.isdigit()])
        # Remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Split
        tokens = document.split()
        # Stopwords
        tokens = [w for w in tokens if w not in stop_words]
        # Concatenate
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text
Example #9
def Stop():
    print("\nThe outcomes of Stop Words are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)
    print(nlp.Defaults.stop_words)
Example #10
    def _set_stopwords(self) -> 'KeywordRanking':
        stop_words = STOP_WORDS.union(self.stopwords) if self.stopwords else STOP_WORDS

        for word in stop_words:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        return self
Example #11
def cluster_commonwords(texts, nwords=10, onlycorona="yes"):
    ignore_words = STOP_WORDS if onlycorona == "no" else STOP_WORDS.union(
        ['coronavirus', 'covid', 'covid19', 'covid-19'])
    allwords = [
        w for w in ' '.join(texts.str.lower()).split()
        if w not in ignore_words and re.search('[a-z]', w)
    ]
    return ', '.join(
        [word for word, cnt in Counter(allwords).most_common(nwords)])
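# A quick usage sketch (hypothetical data; assumes pandas as pd, re, Counter and
# STOP_WORDS are imported as in the snippet above):
if __name__ == "__main__":
    texts = pd.Series(["Covid cases rise in the city",
                       "City hospitals report covid cases"])
    # with the default onlycorona="yes" the covid terms are filtered out,
    # leaving something like "cases, city, rise"
    print(cluster_commonwords(texts, nwords=3))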
Example #12
 def set_stopwords(self, stopwords):
     """Set stop words"""
     if self.language == "en":
         for word in STOP_WORDS.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
     elif self.language == "de":
         for word in STOP_WORDS_DE.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
def transform_matrix(content_body, tokenize_lemma):
    html_stop_words = get_html_stop_words(content_body, tokenize_lemma)
    stop_words_lemma_train = set(
        tokenize_lemma(' '.join(STOP_WORDS.union(set(html_stop_words)))))
    X = content_body
    tfidf_vectorizer = TfidfVectorizer(max_features=300,
                                       stop_words=stop_words_lemma_train,
                                       tokenizer=tokenize_lemma)
    tfidf_vectorizer = tfidf_vectorizer.fit(X)
    tfidf_matrix = tfidf_vectorizer.transform(X)
    return tfidf_matrix
Example #14
def remove_stopwords(content):
    custom_stopwords = ("feeling", "feel", "becaus", "want", "time", "realli",
                        "im", "think", "thing", "ive", "still", "littl", "one",
                        "life", "peopl", "need", "bit", "even", "much", "dont",
                        "look", "way", "love", "start", "s", "m", "quot",
                        "work", "get", "http", "go", "day", "com", "got", "see"
                        "4pm", "<BIAS>", "veri", "know", "t", "like", "someth",
                        "good", "going", "today", "u", "new", "cant", "people",
                        "little", "pretty", "things")
    return hero.remove_stopwords(content,
                                 spacy_stop_words.union(custom_stopwords))
def preprocess_text(author_df):
    nlp = spacy.load('en')
    STOP_WORDS.add("'s")
    STOP_WORDS.add('the')
    STOP_WORDS.add('a')
    for word in STOP_WORDS:
        nlp.vocab[word].is_stop = True
    doc = author_df.text.apply(nlp)

    # remove stop words and punctuations
    clean_and_lemmatize = lambda x: ' '.join([t.lemma_ for t in x if not t.is_punct and not t.is_stop])
    author_df['text_cleaned'] = doc.apply(clean_and_lemmatize)

    # entities
    author_df['text_with_entities'] = doc.apply(replace_ents)

    # pos-tag pairs
    author_df['text_pos_tag_pairs'] = author_df['text'].apply(lambda row: pos_tag_pairs_sentence(row))

    # additional nlp meta features
    author_df['polarity_of_text'] = author_df['text'].apply(lambda row: get_polarity(row))
    author_df['punct_cnt'] = doc.apply(lambda x: len([t for t in x if t.is_punct]))
    author_df['words_cnt'] = doc.apply(lambda x: len([t for t in x if not t.is_punct]))
    author_df['ents_cnt'] = doc.apply(lambda x: len(x.ents))
    author_df['noun_chunks_cnt'] = doc.apply(lambda x: len(list(x.noun_chunks)))
    author_df['fraction_noun'] = author_df['text'].apply(lambda row: fraction_noun(row))
    author_df['fraction_adj'] = author_df['text'].apply(lambda row: fraction_adj(row))
    author_df['fraction_verbs'] = author_df['text'].apply(lambda row: fraction_verbs(row))

    return author_df
def preprocess(texts):

    texts = str(texts)
    texts = texts.lower()
    texts = re.sub(r"(http|@)\S+", " ", texts)
    texts = demojize(texts)
    texts = re.sub(r"’", "'", texts)
    texts = re.sub("n't", "n not", texts)
    texts = re.sub("'ll", " will", texts)
    texts = re.sub("'ve", " have", texts)
    texts = re.sub(r"[^a-z\':_]", " ", texts)
    texts = re.sub(r"[0-9]+", " ", texts)
    texts = re.sub("re-[a-z]+", " ", texts)
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = re.sub(pattern, r"\1", texts)

    tokens = tokenizer(texts)
    try:
        STOP_WORDS.remove('not')
        STOP_WORDS.remove('nor')
        STOP_WORDS.remove('no')
    except KeyError:
        # the negation words may already have been removed on an earlier call
        pass

    lemma_list = []
    for token in tokens:
        # compare the token's text, not the Token object, against the stop list
        if token.text not in STOP_WORDS:
            lemma_list.append(token.lemma_)
    texts = ' '.join(map(str, lemma_list))
    pred_vect = vectorizer.transform([texts])
    texts = label.classes_[model.predict(pred_vect)]
    texts = ' '.join(map(str, texts))

    return texts
Example #17
def scripts_to_tfidf(scripts):
    """Create Tfidf matrix from tokenized scripts."""
    # custom stop words for scripts
    film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
    stop_words = STOP_WORDS.union(film_stop_words)

    # vectorize scripts into Tfidf matrix
    vectorizer = TfidfVectorizer(input='content', stop_words=stop_words, min_df=0.2,
                                 ngram_range=(1, 2))  # less than 20% frequency words are removed.
    bow = vectorizer.fit_transform(scripts)
    vocab = vectorizer.get_feature_names()

    return bow, vocab
    def detectTextIn(self, Text):
        classFromText = []
        classFromText.append(Text)
        # Text=Text.lower()
        nlp = spacy.load('en_core_web_sm')
        # Adding Custom stop words
        STOP_WORDS.add("picture")
        STOP_WORDS.add("image")
        STOP_WORDS.add("images")
        STOP_WORDS.add("pics")
        STOP_WORDS.add("portrait")
        for word in STOP_WORDS:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        uni_string = str(Text)
        doc = nlp(uni_string)
        for ent in doc.ents:
            classFromText.append(ent.label_)

        Text = Text.lower()
        uni_string = str(Text)
        doc = nlp(uni_string)

        for token in doc:
            # """token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            #       token.shape_, token.is_alpha, token.is_stop"""
            if not token.is_stop:
                classFromText.append(token.lemma_)
                classFromText.append(token.text)

        classFromText = [a.lower() for a in classFromText]
        # drop empty strings (removing items while iterating over a list skips elements)
        classFromText = [text for text in classFromText if text != ""]
        classFromText = set(classFromText)
        return classFromText
Example #19
def stage2(process_folder, label):

    path_stage1 = process_folder + label + 'stage1.json'

    from spacy.lang.en.stop_words import STOP_WORDS

    path_stage2 = process_folder + label + 'stage2.json'

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(
                path_stage1, ranks, stopwords=STOP_WORDS):
            f.write("%s\n" % pretty_print(rl._asdict()))
Example #20
def prepare_stopwords():
    NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere","no",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

    stopwords = STOP_WORDS.copy()
    for word in STOP_WORDS:
        if word in NEGATE:
            stopwords.remove(word)

    return stopwords
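# A shorter, behaviour-equivalent sketch of the same idea: the negation terms are
# subtracted from spaCy's stop list with a set difference instead of an explicit
# loop (prepare_stopwords_diff is a hypothetical name, not part of the original):
def prepare_stopwords_diff(negate_words):
    """Return a copy of spaCy's STOP_WORDS without the given negation terms."""
    return STOP_WORDS.difference(negate_words)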
def get_baseline():
    print('Spacy:', len(STOP_WORDS))
    sw_sk = set(stop_words.ENGLISH_STOP_WORDS)
    print('sklearn', len(sw_sk))

    sw = set(
        pd.read_csv(
            'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
            header=None,
            squeeze=True).tolist())

    print('web', len(sw))

    all_words = STOP_WORDS.union(sw).union(sw_sk)  # avoid shadowing the builtin all()
    print('all', len(all_words))
    pd.Series(sorted(list(all_words))).to_csv('baseline.csv', index=False)
Example #22
import numpy as np
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus
# from IPython.display import Image
from spellchecker import SpellChecker
import pickle

# using SMOTE to deal with class imbalance
#from imblearn.over_sampling import SMOTE
# these symbols seem to carry extra weight in the final words when Naive Bayes is used,
# so they are added to the punctuation list to be filtered out
punctuations = string.punctuation + "".join(
    ["...", "..........", "....", "--", "/"])
nlp = spacy.load("en_core_web_sm")
STOP_WORDS = STOP_WORDS.union(CUSTOM_STOP_WORDS)

#excluding NO from stopwords for our use
#STOP_WORDS.discard("no")
#STOP_WORDS.discard("not")
#STOP_WORDS.discard("off")

Urban_vocab = pd.read_csv("urbandict-word-def.csv")
Urban_vocab = Urban_vocab["WORD"].tolist()
# contraction_log = open("1_contractions.log", "w")
# slang_log = open("1_slang.log", "w")
out_vocab = open("1_o_vocab.log", "w")
corpus_vocab = open("1_corpus_vocab.log", "w")

parser = English()
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.SMILEY, p.OPT.NUMBER,
Example #23
 def set_stopwords(self, stopwords):
     """Set stop words"""
     for word in STOP_WORDS.union(set(stopwords)):
         lexeme = nlp.vocab[word]
         lexeme.is_stop = True
Example #24
File: aux.py  Project: MarcosFP97/eXtream
def queryTokens(cadena, languages):
    cadena = __preprocessString(cadena)
    # remove non ascii-characters
    cadena = ''.join(i for i in cadena if ord(i) < 128)
    cadena = cadena.strip()  # remove initial and end spaces
    word_tokens = word_tokenize(cadena)
    # Detect in which language the text is written
    lang = detect_language(word_tokens, languages)
    stop_words = set(stopwords.words(lang))  # Filtering stop words
    inverters = set([
        'dont', 'doesnt', 'havent', 'arent', 'didnt', 'wasnt', 'werent', 'not',
        'never', 'hardly', 'seldom'
    ])
    incrementers = set(['too', 'many', 'much', 'very', 'lots'])
    STOP_WORDS.update({
        'im', 'pm', 'ai', 'ie', 'still', 'cant', 'isnt', 'couldnt', 'youre',
        'seen', 'say', 'says', 'tell', 'lot', 'lol', 'hes', 's', 'be'
    })
    filtered_sentence = [
        w for w in word_tokens if w not in stop_words and w not in inverters
        and w not in incrementers and w not in STOP_WORDS
    ]  # Checking not in stop_words
    return filtered_sentence
Example #25
import os
import re
from unidecode import unidecode
import numpy as np
import json
import sys
import logging
from numpy.linalg import norm
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('de_l_la_le_di')
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)-5.5s]  %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()


class MemoryGenerator():
    def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path):
        logger.info("Initializing Memory Generator ....")
        self.conv2kg = conv2kg
        self.kgs = kgs
        self.mapping = json.load(open("data/" + dataset + "/ERmapping.json"))
        self.maxEntity, self.maxRel = self.read_dataset(dataset)
        logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " +
                    str(self.maxRel))
        self.matrix_dim = self.maxEntity + self.maxRel
        self.word_emb = load_facebook_model(
            datapath(os.getcwd() + "/" + fasttext_emb_path))
    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_tabe_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_tabe_name: (str) Name of the table where the document information is stored.
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (str) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_tabe_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            self.first_distinct_documents = list(open_pc.cursor.fetchall())
            # extract from the tuple structure
            self.first_distinct_documents = [
                el[0] for el in self.first_distinct_documents
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = \
                self.first_distinct_documents[:self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse:
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve.keys():
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(
            [value for value in self.sentence_fields.values()])

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        for word in custom_stopwords:
            self.stopwords.add(word)

        self.logger.info("Successfully initialized TermGenerator.")
import hashlib
from pytorch_pretrained_bert import BertTokenizer
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
STOP_WORDS.update(string.punctuation)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_tokenization_length(context, question, reference, candidate):
    context_len = len(tokenizer.tokenize(context))
    question_len = len(tokenizer.tokenize(question))
    candidate_len = len(tokenizer.tokenize(candidate))
    reference_len = len(tokenizer.tokenize(reference))

    return max(context_len + question_len + candidate_len,
               context_len + question_len + reference_len)


def check_data_and_return_hash(context, question, reference, candidate):
    assert type(context) == type(question) == type(reference) == type(
        candidate) == str

    if context == '' or question == '' or reference == '' or candidate == '':
        return None

    sample = context + question + reference + candidate
    hash_object = hashlib.md5(sample.encode())
Example #28
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import os
import json
from pathlib import Path

nlp = spacy.load('en_core_web_lg')

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.readlines()
    STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
    STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings:
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',
}

documents = []  #  [ [token, token, token], [token, token, token], ...]

fp = '../data/LRECjson/'
for jsonfile in os.listdir(Path(fp)):
    #for jsonfile in ['../data/LRECjson/2018_1049.json']:
Windows: python -m spacy download en   (run as Administrator)

Linux: sudo python -m spacy download en
"""

nlp = spacy.load('en')

"""#Exploring spaCy"""

from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS

f'There are {len(STOP_WORDS)} stopwords in spaCy'

# You can add your own corpora specific STOPWORDS using the .add syntax
STOP_WORDS.add("your_additional_stop_word_here")
f'After adding your own stop words, spaCy will use {len(STOP_WORDS)} stopwords'

doc = nlp("I am learning the most important ideas Natural Language Processing ideas using Python")
print(doc)  # doc is a spaCy object which stores the entire document string

"""**About spaCy objects**"""

for token in doc:
    print(token)

simplified_doc = [token for token in doc if not token.is_punct | token.is_stop]
simplified_doc
# please note that .orth_ attribute returns the unicode string representation of the token

"""We can also check what other things we know about these tags in the simplified_doc:"""
Example #30
import pickle
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en', disable=['parser'])


CLASSIFIER_ROOT = 'classifiers/'
TRANSFORMERS = ['transform_bag_of_words_0.sav',
                'transform_bag_of_words_1.sav']
MODELS = ['nb.sav']


STOP_WORDS.add("'s")
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def load_model(model_name):
    with open('{0}{1}'.format(CLASSIFIER_ROOT, model_name), 'rb') as f:
        model = pickle.load(f)
    return model

CLF_NB = load_model(MODELS[0])
TRANSFORMERS_MODELS = [load_model(TRANSFORMERS[0]), load_model(TRANSFORMERS[1])]

def clean_html(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext.lower()