Example #1
def lemmatize_corpus(liste, threshold, exclusion=True):
    """
    :param liste: corpus_words, the list of word tokens from the lyrics
    :param threshold: minimum frequency a token must exceed to be kept when exclusion is enabled
    :param exclusion: if True, drop tokens whose corpus frequency is not above threshold
    :return: list of lemmatized words from the lyrics
    """
    words = []
    corpus_words = liste
    word_frequency = {}
    lemmas_words = []
    for text in corpus_words:
        for token in text:
            if token in word_frequency:
                word_frequency[token] += 1
            else:
                word_frequency[token] = 1
    #print("word frequency : ",word_frequency)
    # Build the lemmatizer once instead of re-creating it for every token.
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    for text in corpus_words:
        for token in text:
            if exclusion:
                if word_frequency[token] > threshold:
                    words.append(token)
                    lemmas = lemmatizer(token, u'NOUN')
                    lemmas_words.append(" ".join(lemmas))
            else:
                lemmas = lemmatizer(token, u'NOUN')
                # print("token : ", token, " Lemmatization : ", lemmas)
                lemmas_words.append(" ".join(lemmas))
    #print("lemmas words : \n", lemmas_words)
    return lemmas_words
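For context, a minimal usage sketch of the function above (it assumes the spaCy 2.x imports used throughout these examples, i.e. Lemmatizer from spacy.lemmatizer and LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES from spacy.lang.en; the tiny corpus is made up):

# Hypothetical input: each inner list is one tokenized lyric.
corpus = [["dogs", "are", "running"], ["dogs", "sleep"]]

# Keep only tokens seen more than once, lemmatized as nouns.
print(lemmatize_corpus(corpus, threshold=1))
# -> ['dog', 'dog']  ("dogs" is the only token with corpus frequency > 1)

# Lemmatize every token, ignoring frequencies.
print(lemmatize_corpus(corpus, threshold=0, exclusion=False))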
Example #2
    def __init__(self,
                 disable: list = None,
                 stopwords: list = None,
                 batch_size: int = None,
                 ngram_range: Tuple[int, int] = None,
                 lemmas: bool = False,
                 lowercase: bool = None,
                 alphas_only: bool = None):
        """
        :param disable: pipeline processors to omit; if nothing should be disabled,
         pass an empty list
        :param stopwords: a set of words to skip
        :param batch_size: a batch size for internal spaCy multi-threading
        :param ngram_range: range for producing ngrams, e.g. (1, 2) for unigrams plus bigrams,
        (2, 2) for bigrams only
        :param lemmas: whether to perform lemmatization while tokenizing; currently works only
        for the English language
        """
        if disable is None:
            disable = ['parser', 'ner']
        self._stopwords = stopwords or []

        self.model = spacy.load('en', disable=disable)
        self.tokenizer = Tokenizer(self.model.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES,
                                     LOOKUP)
        self.batch_size = batch_size
        self.ngram_range = ngram_range
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
Example #3
def solve(word_list):
    wnl = stem.WordNetLemmatizer()
    porter = stem.porter.PorterStemmer()
    a = [porter.stem(word) for word in word_list]        # NLTK Porter stems
    b = [wnl.lemmatize(word) for word in word_list]      # NLTK WordNet lemmas
    lemmatizer = Lemmatizer()
    c = [lemmatizer.lookup(word) for word in word_list]  # spaCy lookup lemmas
    return {'a': a, 'b': b, 'c': c}
Example #4
 def __init__(self,
              filter_score=DEFAULT_FILTER_CONTEXT_SCORE,
              proxy_server=DEFAULT_PROXY_SERVER):
     self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
     self.wikipedia_cache = {}
     self.fetcher = AsyncWikiSearcher(proxy_server)
     self.graph_data = GraphData()
     self.wikidata_property_table = WikiDataPropertyTable.get_instance()
     self.embedding = {}
     self.filter_score = filter_score
     self.NLP = SpacyNLPFactory.create_simple_nlp_pipeline()
     self.all_domain_vector = {}
Example #5
    def lemmatize(self, *args, **kwargs):
        if not self._lemmatizer:
            from spacy.lemmatizer import Lemmatizer
            from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

            self._lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        return self._lemmatizer(*args, **kwargs)[0]
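A self-contained sketch of the same lazy-initialization pattern (the _LazyLemmatizer holder class is hypothetical, and it assumes spaCy 2.0/2.1, where these imports exist):

class _LazyLemmatizer:
    # Hypothetical holder; in the original snippet this method lives on a larger object.
    _lemmatizer = None

    def lemmatize(self, *args, **kwargs):
        if not self._lemmatizer:
            from spacy.lemmatizer import Lemmatizer
            from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

            # Built once on first use, then reused by every later call.
            self._lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        return self._lemmatizer(*args, **kwargs)[0]


print(_LazyLemmatizer().lemmatize("dogs", "NOUN"))  # expected: "dog"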
Example #6
def task_three(doc1):
    tokens = []
    grammar_tags = dict()
    part_of_speech_tags = list()
    depend_tags = dict()
    lemmas = dict()
    for tokk in doc1:
        tokens.append(tokk.text)
        grammar_tags[tokk] = tokk.tag_
        part_of_speech_tags.append(tokk.pos_)
        if tokk.dep_ in depend_tags.keys():
            value_set = depend_tags[tokk.dep_]
            value_set.add(tokk)
        else:
            value_set = set()
            value_set.add(tokk)
            depend_tags[tokk.dep_] = value_set

    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    for token_text, pos in zip(tokens, part_of_speech_tags):
        lemmas[token_text] = lemmatizer(token_text, pos)

    print("Tokenize")
    print(tokens)
    print()
    print("Lemmatize")
    print(lemmas)
    print()
    print("POS Tags")
    print(grammar_tags)
    print()
    print("Dependency Parse Tree")
    print(depend_tags)
    print()

    for tokk in tokens:
        syn = wn.synsets(tokk)
        hypernym = []
        hyponym = []
        holonym = []
        meronym = []
        for synset in syn:
            hypernym.append(synset.hypernyms())
            hyponym.append(synset.hyponyms())
            holonym.append(synset.part_holonyms())
            meronym.append(synset.part_meronyms())
        print(tokk)
        print()
        print("Hypernyms")
        print(hypernym)
        print()
        print("Hyponyms")
        print(hyponym)
        print()
        print("Holonyms")
        print(holonym)
        print()
        print("Meronyms")
        print(meronym)
        print()
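One possible way to drive the function above (the imports are assumptions based on the names the snippet uses: spaCy 2.0/2.1 for Lemmatizer and the LEMMA_* constants, NLTK's WordNet corpus for wn):

import spacy
from nltk.corpus import wordnet as wn
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

nlp = spacy.load('en_core_web_sm')
# Prints tokens, lemmas, POS tags, dependency groups and WordNet relations for each token.
task_three(nlp("Dogs chase the cat"))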
Example #7
    def __init__(self, inferenceEngine, colorFile="corpora/colors.csv", sizeFile="corpora/sizes.txt", shapeFile="corpora/shapes.txt", nerModel="models/nerModel"):
        self.query = ""
        self.nlp = spacy.load('en')
        ner = spacy.load(nerModel).pipeline[0][1]
        self.nlp.replace_pipe("ner", ner)

        self.inferenceEngine = inferenceEngine

        self.matcher = Matcher(self.nlp.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.scene = {
            "objects": [],
            "backgrounds": []
        }
        self.subjects = {}
        self.referenceWords = ["the", "it", "that", "his", "hers", "theirs"]
        self.colors = {}
        with open(colorFile, "r") as colorReader:
            for line in colorReader:
                colorValue = line.split(",")
                self.colors[colorValue[0].lower()] = colorValue[1].strip("\n")

        self.sizes = {}
        with open(sizeFile, "r") as sizeReader:
            for line in sizeReader:
                line = line.strip().lower()
                sizeValue = line.split(",")
                self.sizes[sizeValue[0]] = sizeValue[1].strip("\n")

        self.shapes = []
        with open(shapeFile, "r") as shapeReader:
            self.shapes = [shape.strip().lower() for shape in shapeReader]
Example #8
def wsd_level(sentence, word):

    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    bert = (model, tokenizer)
    doc = nlp(sentence.lower())
    words = []
    pos_tags = []
    for token in doc:
        words.append(token.text)
        pos_tags.append(token.pos_)

    idx = [i for i, w in enumerate(words) if w == word]

    wsd_results = wsd_bert(words,
                           idx,
                           pos_tags,
                           lemmatizer=lemmatizer,
                           bert=bert)
    #     print(wsd_results)
    wsd_scores = [
        score for example_info in wsd_results
        for example, level, score in example_info
    ]
    wsd_levels = [
        level for example_info in wsd_results
        for example, level, score in example_info
    ]

    #     print(wsd_scores.index(max(wsd_scores)))
    return wsd_levels[wsd_scores.index(max(wsd_scores))]
Example #9
    def __init__(self, process, print_mode="single"):
        self.process = process
        self.print_mode = print_mode
        logger = logging.getLogger("program")
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
        logging.root.setLevel(level=logging.INFO)
        logger.info("running %s", ' '.join(sys.argv))
        self.logger = logger
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.user = getpass.getuser()
        if os.path.isdir("/media/" + self.user + "/Data4/ROSA/db/"):
            self.data_dir = "/media/" + self.user + "/Data4/ROSA/db/"
            self.save_dir = "/media/" + self.user + "/Data4/ROSA/graph/"
        else:
            self.data_dir = "/home/" + self.user + "/Data4/ROSA/db/"
            self.save_dir = "/home/" + self.user + "/Data4/ROSA/graph/"

        if self.process == "update":
            self._convert_raw_data_to_graph()
        # elif self.process == "run":
        self._load_graph()

        self.logger.info("ROSA is now live..")

        while True:
            text = raw_input(
                "ROSA says: What do you want to know about? ")  # Python 2
            self._query(text)
Example #10
    def __init__(self, train_corpus: TextIO):
        """Initialize and train the model.

        Args:
            train_corpus: An open file or text stream with annotated text for training the model.
        """

        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.coocurrenz = defaultdict(lambda: defaultdict(int))
        self.all_verbs = set()
        self.all_nouns = set()
        for line in train_corpus:
            verbs = set()
            nouns = set()
            for word_pos in line.split():
                (word, pos) = str(word_pos).split('_')
                pos_type = pos[0]
                if (pos_type != 'V') and (pos_type != 'N'):
                    continue  # nothing to do with the word
                word = word[2:]
                if pos_type == 'V':
                    verb = lemmatizer(word, u'VERB')[0]
                    verbs.add(verb)
                elif pos_type == 'N':
                    noun = lemmatizer(word, u'NOUN')[0]
                    nouns.add(noun)
            self.all_nouns.update(nouns)
            self.all_verbs.update(verbs)
            for noun in nouns:
                for verb in verbs:
                    self.coocurrenz[noun][verb] += 1
        self.all_nouns = sorted(self.all_nouns)
        self.all_verbs = sorted(self.all_verbs)
Example #11
    def concept_sets(self, value):
        """
        Sets concepts_sets and the attributes derived from it.

        Args:
            value (list of list of str): A list of lists of strings; each string being a concept,
                each set in the larger list corresponding to a document which has the tags seen in the set.
        """
        self._concept_sets = value
        LOG.debug("Extracting raw keywords as concepts.")
        all_concepts = [
            concept
            for concept_set in tqdm(self._concept_sets)
            for concept in concept_set
            if concept.strip() != ""
        ]
        raw_concepts = set(all_concepts)

        LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
        concepts = [c.lower() for c in raw_concepts]

        self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)}
        lookups = Lookups()
        lookups.add_table("lemma_lookup", self.raw2lemma)
        self.lemmatizer = Lemmatizer(lookups)
        self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
        lemma_concepts = [
            self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts
        ]
        self.concepts_frequencies = Counter(lemma_concepts)
        self.concepts = set(lemma_concepts)
        self._fit_concept_indices()
Example #12
def lemmatize(descriptions):
    tqdm_notebook().pandas()
    nlp = spacy.load('en')
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    return descriptions.progress_apply(lambda desc: ' '.join(
        [lemmatizer(token.text, token.pos_)[0] for token in nlp(desc)]))
Example #13
def lemmatizer():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {
        "dogs": "dog",
        "boxen": "box",
        "mice": "mouse"
    })
    return Lemmatizer(lookups)
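A brief sketch of how the lemmatizer returned by this fixture behaves (assuming spaCy 2.2+, where Lemmatizer accepts a Lookups object and exposes a lookup() method):

from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
lem = Lemmatizer(lookups)

print(lem.lookup("dogs"))  # "dog"  (table hit)
print(lem.lookup("cats"))  # "cats" (unknown strings fall back to themselves)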
Example #14
 def __init__(self, model="en_core_web_lg", testing=False):
     print("Loading {}...".format(model))
     self.nlp = spacy.load(model)
     self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
     self.title_similarity = 0.90
     print("{} loaded.".format(model))
     if not testing:
         super().__init__()
Example #15
 def __init__(self):
     self.nlp = StanfordCoreNLP('http://localhost', port=9000)
     self.spacy = spacy.load('en_core_web_sm')
     self.rewritten = None
     self.sentences = None
     self.text2ent = defaultdict(None)
     self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
     self.adjResolver = AdjectiveResolver()
     self.verbResolver = VerbResolver()
Example #16
def lemmatization(tweet, nlp):
    """Returns the lemma of the tweet."""
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    tweet = nlp(tweet)
    lemmatized = [
        lemmatizer(word.text.lower(), word.pos_)[0] for word in tweet
    ]

    return " ".join(lemma for lemma in lemmatized)
Example #17
def lemma_wordlist(word_list):
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemma_list = []

    for word in word_list:
        # The lemmatizer returns a list of candidate lemmas; keep the first one.
        lemma = lemmatizer(word, u"NOUN")
        lemma_list.append(lemma[0])

    return lemma_list
Example #18
def get_lemmatizer():
    if not _spacy['lemmatizer']:
        from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
        from spacy.lemmatizer import Lemmatizer
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        _spacy['lemmatizer'] = lemmatizer
    else:
        lemmatizer = _spacy['lemmatizer']
    return lemmatizer
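For illustration, the module-level cache that get_lemmatizer() relies on might be set up as below; the exact shape of _spacy is an assumption, since the snippet only shows it being read and written:

# Assumed module-level cache with a slot for the lemmatizer (spaCy 2.0/2.1).
_spacy = {'lemmatizer': None}

lem1 = get_lemmatizer()   # builds and caches the Lemmatizer
lem2 = get_lemmatizer()   # returns the cached instance
assert lem1 is lem2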
Example #19
 def __init__(self, recognizer, source):
     self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
     self.engine = pyttsx3.init()
     self.engine.setProperty(
         'voice', 'com.apple.speech.synthesis.voice.ava.premium')
     self.engine.setProperty('rate', self.engine.getProperty('rate') - 31.5)
     self.nlp = spacy.load("en_core_web_sm")
     self.recognizer = recognizer
     self.source = source
Example #20
def lemmatize(sent, nlp):
    lemmatized = []
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    doc = nlp(str(sent))
    for token in doc:
        i = lemmatizer(str(token), token.pos_)
        # print(i)
        lemmatized.append(i[0])
    return ' '.join(lemmatized)
Example #21
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    rules = {"verb": [["ed", "e"]]}
    lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
Example #22
def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    index = {"verb": ("cope", "cop")}
    exc = {"verb": {"coping": ("cope", )}}
    rules = {"verb": [["ing", ""]]}
    lemmatizer = Lemmatizer(index, exc, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
Example #23
def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    lookups = Lookups()
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope", )}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"
Example #24
def lemmatize(data):
    output = []
    lemmatizerEn = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmatizerEs = Lemmatizer(LEMMA_INDEX,
                              LEMMA_EXC,
                              LEMMA_RULES,
                              lookup=spacy.lang.es.LOOKUP)
    bar = ChargingBar('Lemmatizing\t\t\t\t', max=len(data))
    for instance in data:
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = instance['langid']
        new_tweet['sentiment'] = instance['sentiment']
        for i, word in enumerate(instance['tokens']):
            if (instance['langid'][i] == 'lang1'):
                new_tweet['tokens'].append(lemmatizerEn.lookup(word))
            elif (instance['langid'][i] == 'lang2'):
                new_tweet['tokens'].append(lemmatizerEs.lookup(word))
            else:
                new_tweet['tokens'].append(word)

            # new_tweet['tokens'].append(lemmatizerEn.lookup(word))
        output.append(new_tweet)
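        # Note: the second new_tweet built below is never appended or read; it is
        # simply overwritten at the top of the next loop iteration.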
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = []
        new_tweet['sentiment'] = instance['sentiment']
        bar.next()
    bar.finish()
    return output
Example #25
 def __init__(self):
     self.entities = []
     self.columns = []
     self.relationships = []
     self.synonyms_col = []
     self.synonyms_tab = []
     self.entity_graph = []
     self.loaded_entities = []
     self.config = Configuration()
     self.conn = pyodbc.connect(self.config.get_sql_connection_string())
     lookups = Lookups()
     self.lemmatizer = Lemmatizer(lookups)
     self.load_db_model()
Example #26
    def get_cleaned_text(self):
        '''
        Clean the text: lower-case it, remove stop words and one-character
        tokens, then lemmatize the remaining words.
        '''

        f = gzip.open(self.path, 'rb')
        self.text = f.read().decode('utf-8')

        #removing stop words and words with only one character
        nlp = spacy.load("en_core_web_sm")
        stop_words = set(nlp.Defaults.stop_words)
        self.text = self.text.lower().split(' ')
        self.text = [
            word for word in self.text
            if word not in stop_words and len(word) > 1
        ]

        #lemmatizing the words
        lemmatizer = Lemmatizer()
        self.text = [lemmatizer.lookup(word) for word in self.text]

        return self.text
Example #27
def word_lemmatizer(data):
    lemmatizer = Lemmatizer(lookup=LOOKUP)
    for doc in data:
        doc["lemma_paragraphs"] = []
        for i, paragraph in enumerate(doc["stopped_paragraphs"]):
            doc["lemma_paragraphs"].append([])
            for k, sentence in enumerate(paragraph):
                doc["lemma_paragraphs"][i].append([])
                for word in sentence:
                    doc["lemma_paragraphs"][i][k].append(
                        lemmatizer(word, u"NOUN")[0])
    return data
Example #28
def lemmatizer(d):
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    doc = nlp(d)
    str_doc = ''
    for tok in doc:
        # note: the lemmatizer returns a list, so index it to get a plain
        # string for the comparison below
        if tok.text != lemmatizer(tok.text,
                                  tok.pos_)[0] and tok.pos_ == 'NOUN':
            #print(tok.text, lemmatizer(tok.text, tok.pos_), tok.pos_, tok.tag_, '\n')
            str_doc = str_doc + str(lemmatizer(tok.text, tok.pos_)[0]) + ' '
        else:
            str_doc = str_doc + tok.text + ' '
    return str_doc
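A possible call to the function above (it relies on a module-level nlp pipeline and the spaCy 2.x lemmatizer imports, both assumed here; the exact output depends on the model's POS tags):

import spacy
nlp = spacy.load('en_core_web_sm')

# Only tokens tagged NOUN are swapped for their lemma; everything else is kept verbatim.
print(lemmatizer("The cats sat on the mats"))  # e.g. "The cat sat on the mat "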
Example #29
def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"
Example #30
def lemmatize_word(word):
    """
    Assign the base form of a word.
    Input: word (possibly plural, possibly capitalized)
    Return: word (singular, lower case)
    """
    palabra = word
    palabra = Word(str(palabra).lower())
    palabra = palabra.lemmatize()
    if palabra == word:
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        palabra = lemmatizer(word, u'NOUN')
        palabra = palabra[0]
    return palabra
Example #31
def lemmatizer():
    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
Example #32
def lemmatizer(path):
    if path is not None:
        return Lemmatizer.load(path)
    else:
        return None
Example #33
def lemmatizer(package):
    return Lemmatizer.from_package(package)