Example #1
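A note that applies to every snippet on this page: the functions are shown as extracted from their source projects, so imports are omitted. A minimal prelude covering most of them might look like the following sketch (gensim is only needed for the examples that take a word-vector model):

import re
import string

import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# one-time resource downloads:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet'); nltk.download('stopwords')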
    def analyzer(self, question):
        def is_noun(tag):
            return tag in ['NN', 'NNS', 'NNP', 'NNPS']

        def is_verb(tag):
            return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

        def is_adverb(tag):
            return tag in ['RB', 'RBR', 'RBS']

        def is_adjective(tag):
            return tag in ['JJ', 'JJR', 'JJS']

        def penn_to_wn(tag):
            if is_adjective(tag):
                return wn.ADJ
            elif is_noun(tag):
                return wn.NOUN
            elif is_adverb(tag):
                return wn.ADV
            elif is_verb(tag):
                return wn.VERB
            return wn.NOUN

        # "How do i view my course on Canvas"

        keywords_list = []
        tagged_sent = nltk.pos_tag(word_tokenize(question))
        filtered_tokens = []  # tagged tokens that survive stop-word filtering
        mongo_dict = {}
        for word_tuple in tagged_sent:
            if word_tuple[0] and word_tuple[0] not in self.stop_words:
                word_list = list(word_tuple)
                # strip stray punctuation characters from the token
                word_list[0] = re.sub('[!?%$*.@]', '', word_list[0])
                filtered_tokens.append(tuple(word_list))

        for tag in filtered_tokens:
            print("tag", tag[0])
            print(self.dictionary.synonym(tag[0].lower()))
            if tag[1] == 'NNP':
                keywords_list.append(tag[0].lower())
            else:
                wn_tag = penn_to_wn(tag[1])
                word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
                print("word -->", word)
                print("self.dictionary.synonym(word.lower()) -->",
                      self.dictionary.synonym(word.lower()))
                keywords_list.append(word.lower())
                synonym_list = self.dictionary.synonym(word.lower())
                if synonym_list:
                    keywords_list.extend(synonym_list)

        mongo_dict["keywords"] = list(set(keywords_list))
        mongo_dict["text"] = (
            "Yes, Canvas can be integrated with products like: McGraw-Hill Connect, "
            "Macmillan Education, Cengage Learning MindTap, and Pearson's MyLab & Mastering. "
            "Please visit: http://www.sjsu.edu/ecampus/teaching-tools/canvas/integrating-publisher-db/index.html "
            "for more information.")

        print(mongo_dict)
        self.dbclient.insert(mongo_dict)
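The Penn-to-WordNet mapping above is the reusable piece. A minimal, self-contained sketch of the same idea, with no class state assumed:

import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
sentence = "How do I view my course on Canvas"
for token, penn in nltk.pos_tag(word_tokenize(sentence)):
    # the first letter of the Penn tag selects the WordNet POS; default to noun
    wn_pos = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}.get(penn[0], wn.NOUN)
    print(token, penn, lemmatizer.lemmatize(token, wn_pos))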
Example #2
def removeNoise(tokens, stopWords=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
        '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        # map the Penn Treebank tag to a WordNet POS for the lemmatizer
        # (replaces a call to an undefined `lemmatize_sentence` helper)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith("VB"):
            pos = 'v'
        else:
            pos = 'a'
        token = WordNetLemmatizer().lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stopWords:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
    return cleaned_tokens
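A hedged usage sketch for removeNoise (the tweet text is a placeholder, not from the original):

from nltk import word_tokenize
from nltk.corpus import stopwords

tweet = "@user check https://example.com it is great"
print(removeNoise(word_tokenize(tweet), stopwords.words('english')))
# roughly: lowercase lemmatized tokens with URL and @mention fragments removed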
Example #3
def calcola_fattore_normalizzazione(all_ss, parola):
    # computes the normalization factor for `parola` ("word"): the highest
    # corpus count of any matching lemma across the synsets in `all_ss`
    nome = WordNetLemmatizer().lemmatize(parola, 'n')
    verbo = WordNetLemmatizer().lemmatize(parola, 'v')
    aggettivo = WordNetLemmatizer().lemmatize(parola, 'a')
    avverbio = WordNetLemmatizer().lemmatize(parola, 'r')

    fattore = 0.1
    for ss in all_ss:
        for l in ss.lemmas():
            if l.name().lower() == nome.lower() or l.name().lower() == verbo.lower() \
                    or l.name().lower() == aggettivo.lower() or l.name().lower() == avverbio.lower():
                if l.count() > fattore:
                    fattore = l.count()

    return fattore
Example #4
def calcola_posizione_lemma(ss, parola):
    # returns the position of `parola` (in any lemmatized form) among the
    # lemmas of synset `ss`, or -1 if it does not appear
    nome = WordNetLemmatizer().lemmatize(parola, 'n')
    verbo = WordNetLemmatizer().lemmatize(parola, 'v')
    aggettivo = WordNetLemmatizer().lemmatize(parola, 'a')
    avverbio = WordNetLemmatizer().lemmatize(parola, 'r')

    i = 0
    for l in ss.lemmas():
        if l.name().lower() == nome.lower() or l.name().lower() == verbo.lower()\
                or l.name().lower() == aggettivo.lower() or l.name().lower() == avverbio.lower():
            return i
        i += 1

    return -1
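The two functions above (the identifiers are Italian: fattore = factor, posizione = position) score a word against WordNet synsets. A possible call, assuming the word's own synsets are passed in:

from nltk.corpus import wordnet as wn

synsets = wn.synsets("dogs")
print(calcola_fattore_normalizzazione(synsets, "dogs"))  # max lemma count, floor 0.1
print(calcola_posizione_lemma(synsets[0], "dogs"))       # lemma index, or -1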
Example #5
def semantic_distractor_de(model, word):
    semantic_lst = []
    lemma = WordNetLemmatizer().lemmatize(word, pos="n")
    lemma = lemma.lower()
    for w in model.most_similar(lemma, topn=10):
        semantic_lst.append(w[0])
    return semantic_lst
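semantic_distractor_de needs a gensim word-vector model; the vector file below is a placeholder, not part of the original:

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format("de_vectors.bin", binary=True)
print(semantic_distractor_de(model, "Hunde"))  # ten nearest neighbours of the lemma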
Example #6
def generate_distractor(model, word):
    distractor_set = set()
    #distractor_set.update(lst)
    print("answer:", word)
    print("1.(semantic)", semantic_distractor(model, word))
    #distractor_set.update(hypernym_distractor(word))
    print("2.(shape)" ,shape_distractor(word))
    print("3.(hypernym)", hypernym_distractor(word))
    #distractor_set.update(hyponym_distractor(word))
    print("4.(hyponym)", hyponym_distractor(word))
    #distractor_set.update(synonym_distractor(word))
    #distractor_set.update(antonym_distractor(word))
    antonym_lst = list(set(antonym_distractor(word)))
    print("5.(antonym)", antonym_lst)
    if len(antonym_lst) != 0:
        first_antonym = antonym_lst[0]
        print("6.(antonym's hypernym)", hypernym_distractor(first_antonym))
        print("7.(antonym's hyponym)", hypernym_distractor(first_antonym) )
        print("8.(antonym's shape)", shape_distractor(first_antonym))
        antonym_semantic_lst = []
        # lemmatize the antonym itself (the original lemmatized `word` here)
        antonym_lemma = WordNetLemmatizer().lemmatize(first_antonym, pos="n")
        antonym_lemma = antonym_lemma.lower()
        for w in model.most_similar(antonym_lemma, topn=5):
            antonym_semantic_lst.append(w[0])
        print("9.(antonym's semantic)", antonym_semantic_lst)
    #distractor_set.update(shape_distractor(word))
    print("* avoid synonyms:", set(synonym_distractor(word))- set(word) , "\n")
Example #7
def Q2b():
    # nltk.download('wordnet')
    # read the raw text saved in Q2a (plain file I/O is assumed here in place
    # of the original `nltk.load` call)
    with open('text.txt', encoding='gbk') as f:
        text = f.read()
    token_list = nltk.sent_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    token_list = [nltk.word_tokenize(sen) for sen in token_list]
    new_token = []
    for sens in token_list:
        sens = [word for word in sens if word not in english_punctuations]
        new_token.append(sens)

    new_token = [nltk.pos_tag(sen) for sen in new_token]
    print(new_token)
    lemmatized = []

    for sen in new_token:
        for word in sen:
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            else:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            lemmatized.append(w.lower())
    # test = [WordNetLemmatizer().lemmatize(new_token)]
    # print(new_token[1])
    print(lemmatized)
Example #8
def noiseRemoval(reviewTokens, stop_words=()):
    #  print("review token", reviewTokens)
    cleaned_tokens = []

    for token, tag in pos_tag(reviewTokens):
        token = re.sub("[http[s]?://(!@#$;:!*%)(&^~])", '', token)
        #    print("token" , token)
        token = re.sub(r"http\S+", '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", '', token)
        token = re.sub("[@#:),’]", '', token)
        token = re.sub(r'^https?:\/\/.*[\r\n]*', '', token)
        #token = re.sub("'’", '', token)
        #print(token," ",tag)
        #print(token)

        if tag.startswith("VB"): pos = 'v'
        elif tag.startswith('NN'): pos = 'n'
        else: pos = 'a'

        rootWord = WordNetLemmatizer().lemmatize(token, pos)
        rootWord = rootWord.lower()
        if rootWord not in stop_words and rootWord not in string.punctuation:
            cleaned_tokens.append(rootWord)

    return cleaned_tokens
Example #9
def find_lemma_opinion(word):
    # lemmatize an opinion word; 's' is WordNet's adjective-satellite POS
    if 'not ' in word:
        word = word.replace('not ', '')
        word = WordNetLemmatizer().lemmatize(word, 's')
        word = 'not ' + word
    else:
        word = WordNetLemmatizer().lemmatize(word, 's')
    return word.lower()
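A quick check, on NLTK versions that accept the satellite tag; 'nicer' reduces to 'nice' via WordNet's adjective rules:

print(find_lemma_opinion("nicer"))      # -> 'nice'
print(find_lemma_opinion("not nicer"))  # -> 'not nice'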
Example #10
def unify_word(word):  # went -> go, apples -> apple, BIG -> big
    """Unify verb tense and noun number by lemmatizing across all POS tags."""
    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    lemmatizer = WordNetLemmatizer()
    for wt in [ADJ, ADJ_SAT, ADV, NOUN, VERB]:
        try:
            word = lemmatizer.lemmatize(word, pos=wt)
        except Exception:  # some NLTK versions reject the 's' tag
            pass
    return word.lower()
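Expected behaviour of unify_word, per its own doc comment:

print(unify_word("went"))    # -> 'go'    (verb pass)
print(unify_word("apples"))  # -> 'apple' (noun pass)
print(unify_word("BIG"))     # -> 'big'   (lowercased at the end)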
Example #11
def countexp_noun(para, searchText):
    # counts how many of the search words occur in `para` at least once;
    # the `break` means repeated occurrences of a word add only 1 to the count
    cnt = 0
    searchWords = [WordNetLemmatizer().lemmatize(s.lower(), 'n') for s in searchText]
    for stxt in searchWords:
        for word in para.split():
            w1 = WordNetLemmatizer().lemmatize(word.lower(), 'n')
            if stxt == w1:
                cnt = cnt + 1
                break
    return cnt
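For example, repeated hits of the same search word count only once:

para = "Two cats chased the cat across two gardens"
print(countexp_noun(para, ["cats", "garden"]))  # -> 2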
Example #12
def clean(text):
    # the original parameter was named `string`, shadowing the stdlib module
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    tokens = nltk.word_tokenize(text)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    cleaned_tokens = []
    for token in tokens:
        token = WordNetLemmatizer().lemmatize(token.lower())
        if len(token) <= 2 or token in stopwords:
            continue
        cleaned_tokens.append(token)
    return list(set(cleaned_tokens))
Example #13
    def extractSubjectObject(self, sentence):

        sub = None
        obj = None

        for word in sentence:
            # word[2] is the token form, word[7] the dependency relation
            # (CoNLL-style fields); str.maketrans replaces the Python 2
            # string.maketrans idiom used in the original
            if sub is None and word[7] == "nsubj":
                sub = word[2].translate(str.maketrans("", "", string.punctuation))
                sub = sub.lower()
                sub = WordNetLemmatizer().lemmatize(sub, pos="n")
            if obj is None and word[7] == "dobj":
                obj = word[2].translate(str.maketrans("", "", string.punctuation))
                obj = obj.lower()
                obj = WordNetLemmatizer().lemmatize(obj, pos="n")

        # The sentence did not contain a verb, so we need to back-off to
        # using the tokens tagged with NN (word[4]). Brute-force take the first
        # two tokens tagged with NN.
        #
        # Example use : A big cow in a field. -> cow, field.
        if sub is None:
            for word in sentence:
                if word[4] == "NN":
                    sub = word[2].translate(str.maketrans("", "", string.punctuation))
                    sub = sub.lower()
                    sub = WordNetLemmatizer().lemmatize(sub, pos="n")
                    break

        if obj is None:
            for word in sentence:
                if word[4] == "NN":
                    proposal = word[2].translate(str.maketrans("", "", string.punctuation))
                    proposal = proposal.lower()
                    proposal = WordNetLemmatizer().lemmatize(proposal, pos="n")
                    if proposal != sub:
                        obj = proposal
                        break

        return sub, obj
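extractSubjectObject expects CoNLL-style rows where index 2 is the token, index 4 the POS tag, and index 7 the dependency relation. A hand-built sketch, assuming `extractor` is an instance of the surrounding class (the column layout is an assumption):

sentence = [
    (1, '_', 'dogs', '_', 'NNS', '_', 2, 'nsubj'),
    (2, '_', 'chase', '_', 'VBP', '_', 0, 'root'),
    (3, '_', 'cats', '_', 'NNS', '_', 2, 'dobj'),
]
print(extractor.extractSubjectObject(sentence))  # -> ('dog', 'cat')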
Example #14
def to_word_list(query: str):
    """
    Tworzy ze stringa liste słów
    Słowa za zmienone przy pomocy NLTK
    Sprawdzany jest iloczyn słow ze słownikiem słow angielskich
    :param query: Dane do modyfikacji
    :return:
    """
    with open("usage_files/words.txt") as word_file:
        english_words = set(word.strip().lower() for word in word_file)
    tags = pos_tag(using_translate(query))
    a = []
    for tag in tags:
        wn_tag = penn_to_wn(tag[1])
        word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
        if word.lower() in english_words:
            a.append(word)
    if len(a) == 0:
        raise ValueError("First 5000 words are not in english")
    return a
Example #15
def Q3():
    p = porter.PorterStemmer()
    stopwords = []
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.rstrip())
    # the `with` block closes the file, so no explicit f.close() is needed
    # print(stopwords)
    temp = requests.get("https://www.bbc.com/news/world-us-canada-49871909")
    temp.encoding = 'utf-8'
    soup = BeautifulSoup(temp.content, 'html.parser')
    text_1 = soup.find('div', {'class': 'story-body__inner'}).findAll('p')
    # text_1.remove('<p>')
    text_1 = [part.get_text() for part in text_1]
    text_1 = [nltk.word_tokenize(sen) for sen in text_1]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    text_1 = [[word for word in sens if word not in english_punctuations]
              for sens in text_1]
    text_1 = [[word for word in sens if word not in stopwords]
              for sens in text_1]
    text_1 = [nltk.pos_tag(sen) for sen in text_1]
    # print(text_1)

    result = []

    for sen in text_1:
        for word in sen:
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            elif "N" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            else:
                w = p.stem(word[0])
            result.append(w.lower())
    # print(result)
    fdist = FreqDist(result)
    tops = fdist.most_common(40)
    print(tops)
Example #17
    def analyzer(self, question):
        # "How do i view my course on Canvas"
        def is_noun(tag):
            return tag in ['NN', 'NNS', 'NNP', 'NNPS']

        def is_verb(tag):
            return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

        def is_adverb(tag):
            return tag in ['RB', 'RBR', 'RBS']

        def is_adjective(tag):
            return tag in ['JJ', 'JJR', 'JJS']

        def penn_to_wn(tag):
            if is_adjective(tag):
                return wn.ADJ
            elif is_noun(tag):
                return wn.NOUN
            elif is_adverb(tag):
                return wn.ADV
            elif is_verb(tag):
                return wn.VERB
            return wn.NOUN

        keywords_list = []
        tagged_sent = nltk.pos_tag(word_tokenize(question))

        for tag in tagged_sent:
            # `tag` is a (token, POS) tuple; skip stop words
            if tag and tag[0].lower() not in self.stop_words:
                wn_tag = penn_to_wn(tag[1])
                word = WordNetLemmatizer().lemmatize(tag[0], wn_tag)
                keywords_list.append(word.lower())
        print('------------------------------------')
        print(keywords_list, "keywords_list")
        print('------------------------------------')
        response = self.dbclient.findAll(keywords_list)
        return response
Example #18
def reduced_form(word):
    '''Reduce a word to its root (noun POS by default) to compare with words from a cluster.'''
    w = WordNetLemmatizer().lemmatize(word)
    return w.lower()
Example #19
def find_lemma_aspect(word):
    word = WordNetLemmatizer().lemmatize(word, 'n')
    return word.lower()
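The last two helpers both lemmatize with the noun POS (the default when none is given); for example:

print(find_lemma_aspect("batteries"))  # -> 'battery'
print(reduced_form("geese"))           # -> 'goose'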