Example #1
def match(inp, filename):
    # Substrings stripped from the input before matching.
    exclude = ["iit", "mandi", "!", "?", "-", ".", ",", "in", "at", "on"]

    for i in exclude:
        inp = inp.replace(i, "")

    data = pd.read_csv(filename)
    processedInp = lemmatize_sentence(inp.strip().lower())

    # Pick the stored question whose lemma set overlaps most with the input.
    maxSim = -1
    ans = None
    bestQues = None
    for idx in range(data.shape[0]):
        processedQues = lemmatize_sentence(data.question[idx].strip().lower())
        sim = len(set(processedInp).intersection(set(processedQues)))
        if sim > maxSim:
            maxSim = sim
            bestQues = data.question[idx]
            ans = data.answers[idx]
    if maxSim <= 0:  # no overlapping lemmas at all (or an empty database)
        return "Sorry, I cannot answer you... Some secrets are best not to be revealed :)"
    else:
        return (
            "Your question matched to the following question in database...\n"
            + bestQues + "\nAnswer: \n" + ans)
def calculate_jaccard_score(sen1, sen2):
    array1 = set(lemmatize_sentence(sen1))
    array2 = set(lemmatize_sentence(sen2))

    intersection = array1.intersection(array2)
    union = array1.union(array2)

    return float(len(intersection)) / len(union)
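A minimal usage sketch for calculate_jaccard_score (assuming the function above and pywsd's lemmatize_sentence are in scope; the two questions are only illustrative):

score = calculate_jaccard_score(
    "Where is the IIT Mandi library located?",
    "Where can I find the library at IIT Mandi?")
print(round(score, 2))  # fraction of shared lemmas, between 0.0 and 1.0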
Example #3
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk uses more signature words than the original Lesk
    algorithm (Lesk, 1986) but fewer than Adapted Lesk
    (Banerjee and Pedersen, 2002).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
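simple_lesk is also exposed by pywsd itself; a usage sketch along the lines of the package's documentation (assuming pywsd and the NLTK WordNet corpus are installed):

from pywsd.lesk import simple_lesk

sense = simple_lesk("I went to the bank to deposit my money", "bank", pos='n')
if sense is not None:  # None means the ambiguous word is not in WordNet
    print(sense, sense.definition())  # typically resolves "bank" to the financial-institution sense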
Example #4
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, from_cache=True):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]
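A usage sketch for this cosine variant (same assumptions as above; note that with nbest=True this version returns the full ranked list of (cosine score, synset) pairs instead of a single synset):

from pywsd.lesk import cosine_lesk

best = cosine_lesk("I went to the bank to deposit my money", "bank")
ranked = cosine_lesk("I went to the bank to deposit my money", "bank", nbest=True)
print(best)
print(ranked[:3])  # top three (cosine score, synset) pairs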
Example #5
def adapted_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False,
                from_cache=True):
    """
    This function implements the Adapted Lesk algorithm described in
    Banerjee and Pedersen (2002). It uses lexical items from semantically
    related senses within the WordNet hierarchies to generate more
    lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word, pos=pos, hyperhypo=hyperhypo, adapted=True,
                         remove_stopwords=stop, to_lemmatize=lemma,
                         remove_numbers=True, lowercase=True, to_stem=stem,
                         from_cache=from_cache)

    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
Example #6
def evaluate_algorithm(similarity_option, chunk):
    match = 0
    total = 0
    chunk_text = tree_to_list(chunk)
    surface_words, lemmas, morphy_poss = lemmatize_sentence(chunk_text, keepWordPOS=True)
    assert(len(lemmas) == len(chunk))
    for i in range(0, len(chunk)):
        semcor_word = chunk[i]
        # Skip stop-words and punctuation since they are not in WordNet
        if not isinstance(semcor_word, nltk.tree.Tree):
            continue
        if not isinstance(semcor_word.label(), nltk.corpus.reader.wordnet.Lemma):
            # TODO: semcor_word.label() == 'such.s.00'
            continue
        # Skip named entities
        if semcor_word.label() == nltk.corpus.wordnet.lemma('group.n.01.group') and "') (NE " in semcor_word.pformat():
            continue

        context = [lemma for lemma in lemmas[max(0, i - 15):i+9]]
        lemma = lemmas[i]
        pos = morphy_poss[i]
        synset = max_similarity(context, lemma, pos=pos, option=similarity_option)

        if synset is None:
            # TODO: possibly a bug; for example, "over-all" should be converted to "overall" before looking it up in the WordNet database
            continue
        if synset is not None and semcor_word.label().synset() == synset:
            match += 1
        total += 1

    accuracy = match / total if total else 0.0  # avoid division by zero when no words were scored
    return match, total, accuracy
Example #7
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function implements the Adapted Lesk algorithm described in
    Banerjee and Pedersen (2002). It uses lexical items from semantically
    related senses within the WordNet hierarchies to generate more
    lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(
            set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms +
                ss_simto))

        signature = list([
            j for j in chain(
                *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS
        ])

        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
Example #8
def disambiguate(sentence,
                 algorithm=simple_lesk,
                 context_is_lemmatized=False,
                 similarity_option='path',
                 keepLemmas=False,
                 prefersNone=True,
                 from_cache=True):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: Miss out on POS specification, how to resolve?
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                wn.synsets(lemma)[0]
                if algorithm == original_lesk:  # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence,
                                       lemma,
                                       from_cache=from_cache)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence,
                                       lemma,
                                       pos=pos,
                                       option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence,
                                       lemma,
                                       pos=pos,
                                       context_is_lemmatized=True,
                                       from_cache=from_cache)
            except:  # In case the content word is not in WordNet
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') else
                           (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') else
                           (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
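disambiguate is what pywsd exports at the package level; a usage sketch (assuming pywsd is installed):

from pywsd import disambiguate
from pywsd.similarity import max_similarity

# Each surface word is paired with a Synset, or with None for stopwords,
# punctuation and out-of-WordNet tokens (prefersNone=True by default).
print(disambiguate('I went to the bank to deposit my money'))
print(disambiguate('I went to the bank to deposit my money',
                   algorithm=max_similarity, similarity_option='wup'))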
Example #9
def clean_text(text):
    clean_text = []
    text = text.lower()
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, ' ', text)  #remove HTML tag
    text = re.sub(r'[?|!|\'|"|#]', r'', text)
    text = re.sub(r'[.|,|)|(|\|/]', r' ', text)  #remove punctuation
    text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)

    for word in text.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)

    return (" ".join(clean_text))
def document_tf(doc):
    term_counts = dict()
    sent_tokens = sent_tokenize(doc)
    word_count = 0
    for sent in sent_tokens:
        word_tokens = lemmatize_sentence(sent)
        for word in word_tokens:
            if (word not in stop_words) and (word not in punctuators):
                word_count += 1  # keep track of the total number of words in the doc
                # dict.get returns the specified default value if the key is not found
                term_counts[word] = term_counts.get(word, 0) + 1

    for k in term_counts:
        term_counts[k] /= word_count  # convert raw occurrence counts to relative frequencies
    return term_counts
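A small sketch of how document_tf might be called (assuming the module also defines stop_words and punctuators, e.g. NLTK English stopwords and string.punctuation, and imports sent_tokenize and lemmatize_sentence as the code above expects):

doc = "Cats chase mice. Mice fear cats and dogs."
tf = document_tf(doc)
for term, freq in sorted(tf.items(), key=lambda kv: -kv[1]):
    print(term, round(freq, 3))  # relative frequency of each lemma in the document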
Example #11
File: model.py Project: kurniawankp/NLP
def preprocessing(self, sentence):
    clean_text = []
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)  # remove HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # remove punctuation
    sentence = self.neg_pattern.sub(lambda x: self.negations_dic[x.group()], sentence)  # preserve negation words so they are not lost

    for word in sentence.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)

    return " ".join(clean_text)
Example #12
def cosine_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) \
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
Example #13
File: lesk.py Project: zhanghyy/pywsd
def simple_lesk(context_sentence: str,
                ambiguous_word: str,
                pos: str = None,
                lemma=True,
                stem=False,
                hyperhypo=True,
                stop=True,
                context_is_lemmatized=False,
                nbest=False,
                keepscore=False,
                normalizescore=False,
                from_cache=True) -> "wn.Synset":
    """
    Simple Lesk uses more signature words than the original Lesk
    algorithm (Lesk, 1986) but fewer than Adapted Lesk
    (Banerjee and Pedersen, 2002).

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word,
                                pos,
                                lemma,
                                stem,
                                hyperhypo,
                                stop,
                                from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence,
                            ss_sign,
                            nbest=nbest,
                            keepscore=keepscore,
                            normalizescore=normalizescore)
Example #14
 def recurse(key, d, ans, altans):
     key_lem = None
     if key in courses:
         key_lem = key.lower().strip()
     else:
         key_lem = lemmatize_sentence(key)[0]
     #print("Lemmatized key: " + key_lem)
     #print("Original Key: " + key)
     if key_lem in keywords:
         ans.add(d[key]['resp0nse'])
         for j in d:
             if j != key and j != 'resp0nse':
                 #try:
                 #   altans[j] += d[j]['resp0nse']
                 #except:
                 altans.update({j: d[j]['resp0nse']})
     if type(d[key]) == dict:
         for i in d[key]:
             if i != 'resp0nse':
                 ans, altans = recurse(i, d[key], ans, altans)
     return ans, altans
Example #15
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
Example #16
File: test.py Project: kurniawankp/NLP
def clean_sentence(sentence):
    negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
    neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    clean_text = []
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)  # remove HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # remove punctuation
    sentence = neg_pattern.sub(lambda x: negations_dic[x.group()], sentence)  # preserve negation words so they are not lost
    
    for word in sentence.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)
            
    return (" ".join(clean_text))
Example #17
def disambiguate(sentence, algorithm=simple_lesk,
                 context_is_lemmatized=False, similarity_option='path',
                 keepLemmas=False, prefersNone=True, from_cache=True,
                 tokenizer=word_tokenize):

    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True, tokenizer=tokenizer)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence # TODO: Miss out on POS specification, how to resolve?
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords: # Checks if it is a content word
            if wn.synsets(lemma):
                if algorithm == original_lesk: # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma, from_cache=from_cache)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence, lemma, pos=pos, option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos, context_is_lemmatized=True,
                                       from_cache=from_cache)
            else: # In case the content word is not in WordNet.
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#')
                           else (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#')
                           else (word, lemma, tag) for word, lemma, tag in tagged_sentence]
    return tagged_sentence
Example #18
def normalize_corpus(corpus):
    from pywsd.utils import lemmatize_sentence  # hoisted out of the per-document loop
    normalized_corpus = []
    for doc in corpus:
        doc = re.sub(r"\b[A-Z\.]{2,}s?\b", "", doc)
        doc = remove_stopwords(doc, is_lower_case=True)
        doc = remove_accented_chars(doc)

        doc = expand_contractions(doc)

        doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ', doc)
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=True)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        doc = ' '.join(lemmatize_sentence(doc))
        doc = remove_proper_nouns(doc)
        normalized_corpus.append(doc)
    return normalized_corpus
def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word, \
    pos=None, lemma=True, stem=True, hyperhypo=True, \
    stop=True, context_is_lemmatized=False, \
    nbest=False, synsets_signatures=None, busca_ampla=False):
    """
	In line with vector space models, we can use cosine to calculate overlaps
	instead of using raw overlap counts. Essentially, the idea of using
	signatures (aka 'sense paraphrases') is lesk-like.
	"""

    # Ensure that ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet return None
    #if not wn.synsets(ambiguous_word):
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []

    chave_assinatura = "%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma, stem,
                                              hyperhypo, busca_ampla)

    if chave_assinatura not in DesWordnet.cache_assinaturas:
        synsets_signatures = simple_signature(ambiguous_word,
                                              pos,
                                              lemma,
                                              stem,
                                              hyperhypo,
                                              busca_ampla=busca_ampla)

        DesWordnet.cache_assinaturas[chave_assinatura] = []

        for ss, signature in synsets_signatures.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Removes punctuation.
            signature = [i for i in Util.word_tokenize(signature) \
               if i not in string.punctuation]

            signature = Util.normalizar_ctx(signature,
                                            stop=stop,
                                            lematizar=lemma,
                                            stem=stem)

            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

            DesWordnet.cache_assinaturas[chave_assinatura].append(
                (ss, signature))

    else:
        synsets_signatures = DesWordnet.cache_assinaturas[chave_assinatura]

        for ss, signature in synsets_signatures:
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
Example #20
def searchAPI(searchterm):
    #print(vocab)
    query = searchterm
    query_tokens = lemmatize_sentence(query)  # lemmatize the tokens so they match the vocabulary
    query_vector = []
    query_tf = {}
    total_query_vocab = 0
    print(query_tokens)
    for i in range(len(query_tokens)):
        tok = query_tokens[i]
        try:
            indexvalue = vocab.index(tok)
            query_vector.append(indexvalue)
            query_tf[indexvalue] = 1 + query_tf.get(indexvalue,0)
            total_query_vocab += 1
        except ValueError: # Token doesn't exist in vocab - ignored
            #print(tok, "does not exist in the vocabulary. - Ignoring")
            if tok not in stop_words:
                misspelled = list(spell.unknown([tok]))
                print("invalid -> ",tok)
                if(len(misspelled)==0):
                    print("trying synonyms")
                    syn = list()
                    for synset in wordnet.synsets(tok):
                        for lemma in synset.lemmas():
                            syn.append(lemma.name())    #add the synonyms
                    #print('Synonyms: ' + str(list(set(syn))))
                    found_word = False
                    new_word = ''
                    for word in syn:
                        if word in vocab:
                            new_word = word
                            found_word = True
                            break
                    if found_word:
                        #print('Synonym present in vocab -> ',new_word)
                        indexvalue = vocab.index(new_word)
                        query_vector.append(indexvalue)
                        query_tf[indexvalue] = 1 + query_tf.get(indexvalue,0)
                        total_query_vocab += 1
                    else:
                        print("None of the synonyms present in the vocabulary")
                else:
                    print("trying spelling correction")
                    candidate_list = spell.candidates(misspelled[0])
                    found_word = False
                    corrected_tok = ''
                    #print("Candidates -> ",candidate_list)
                    for word in candidate_list:
                        new_query_tokens = list(query_tokens)  # copy so the original query tokens are not mutated
                        new_query_tokens[i] = word
                        new_query = ' '.join(new_query_tokens)
                        new_lem_query = lemmatize_sentence(new_query)
                        lem_word = new_lem_query[i]
                        #print("word -> ",word, ", present in vocab ->", word in vocab)
                        #print("lem_word -> ",lem_word, ", present in vocab ->", lem_word in vocab)
                        #print("new query -> ",new_query)
                        print("new lemmatized query -> ",new_lem_query)
                        if lem_word in vocab:
                            corrected_tok = lem_word
                            found_word = True
                            break
                    if found_word:
                        #print("corrected -> "+corrected_tok)
                        indexvalue = vocab.index(corrected_tok)
                        query_vector.append(indexvalue)
                        query_tf[indexvalue] = 1 + query_tf.get(indexvalue,0)
                        total_query_vocab += 1
                    else:
                        print("couldnt find any word")

    print("Query as vocab indices:", query_vector)
    print()
    start_time = time.time() # Timer starts

    # First we obtain the list of all possible documents we actually need to search
    # This is a union of the docs in each query term's posting list
    # Not an intersection because we use cosine similarity and not boolean retrieval
    possible_docs = set()
    query_tf_vector = []

    for q in query_vector:
        possible_docs = possible_docs.union(posting_list[q].keys())
        query_tf_vector.append(query_tf[q]/total_query_vocab)
        # We also generate a TF vector for the query. It does not make sense to scale it with IDF.

    # Run through each doc and generate the vector corresponding to the query terms
    # Compute the cosine similarities of it vs the TF vector of the query
    # Ties are broken by the magnitude of the vector - note that this is obtained by only considering the query terms
    # Plus these query term weights were scaled with relative TF, so a higher magnitude means the terms were more important
    doc_scores = {}
    for doc in possible_docs:
        doc_vector = []
        for q in query_vector:
            doc_vector.append(posting_list[q].get(doc,0))
        doc_scores[doc] = (cosine_similarity(doc_vector,query_tf_vector), norm(doc_vector))

    # Results are sorted
    sorted_results = sorted(doc_scores.items(), key=operator.itemgetter(1), reverse=True)

    end_time = time.time() # Timer ends as search portion is complete
    search_time = end_time - start_time

    ct = 0
    print("-------------- SEARCH RESULTS --------------")
    results={}
    results['Documents']={}
    for i in sorted_results:
        fname, rownum = file_dict[i[0]].split(' ')
        rownum = int(rownum[3:])
        search_res = lines[i[0]]
        search_res = search_res.split('\t')[2]
        results['Documents'][i[0]]={'Name': fname, 'Row': rownum, 'Score': i[1], 'Results': search_res}
        ct += 1
        if ct == 10:
            break
    results['Time']=end_time-start_time
    
    return results
Example #21
def process(inpText):
    """This function lemmatizes
    the input sentence."""
    l = lemmatize_sentence(inpText, keepWordPOS=True)
    return l[1]
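With keepWordPOS=True, pywsd's lemmatize_sentence returns a (surface_words, lemmas, pos_tags) triple (see Example #28 below), so process() returns only the lemma list. A hedged usage sketch:

print(process("The dogs were barking loudly"))
# a list of lemmas such as ['the', 'dog', 'be', 'bark', 'loudly'] (the exact output
# depends on the installed pywsd/NLTK versions and tagger data)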
Example #22
def remove_stop(sentence):

    words = lemmatize_sentence(sentence)
    words = [w for w in words if w not in stopwords]
    return ' '.join(words)
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

### data reading + prep
lemmatizer = WordNetLemmatizer()

loc = ("tagged_comments.xlsx")
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)

#lemmatize the sentences
temp_comment = []
for i in range(sheet.nrows):
    temp_comment.append(" ".join(
        lemmatize_sentence(
            re.sub(r'[^a-zA-Z0-9 ]', '', str(sheet.cell_value(i, 0))))))

temp_speaker = []
for j in range(sheet.nrows):
    temp_speaker.append(str(sheet.cell_value(j, 1)))

temp_tag = []
for k in range(sheet.nrows):
    temp_tag.append(str(sheet.cell_value(k, 2)))

data_tuples = list(zip(temp_comment, temp_speaker, temp_tag))
data = pd.DataFrame(data_tuples, columns=["comments", "speaker", "tag"])

### SWEETNAM
sweetnam_comments = data[data['speaker'] == "Sweetnam"]
Example #24
Descriptions.Descriptions = Descriptions.Descriptions.str.replace(
    '[^\w\s]', '')

#remove numeric
Descriptions.Descriptions = Descriptions.Descriptions.str.replace(r"\d+", "")

#Change to lowercase
Descriptions.Descriptions = Descriptions.Descriptions.str.lower()

#not sure I want to remove stop words yet
from nltk.corpus import stopwords
stop = stopwords.words('english')
Descriptions.Descriptions = Descriptions.Descriptions.apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))

Descriptions.Descriptions = Descriptions.Descriptions.apply(
    lambda x: " ".join(lemmatize_sentence(x)))

#Lets create a DTM
dtMatrix = cv.fit_transform(Descriptions.Descriptions).transpose().toarray()
#Inspect it
#print(dtMatrix.shape)
#Print the names of the skills
featurenames = cv.get_feature_names()
#print(featurenames)

#Create a tf-idf
tfidf = TfidfTransformer()
#Turn DTM into tf-idf matrix
tfidfMatrix = tfidf.fit_transform(dtMatrix).toarray()
#print(tfidfMatrix.shape)
del (dtMatrix)
Example #25
from pywsd.utils import lemmatize_sentence

print(lemmatize_sentence("Really scared."))
Example #26
# The actual data is also loaded to display the search results
fileobj = open('tvnews_corpus.tsv', 'r')
lines = fileobj.readlines()
fileobj.close()

# Using a fixed query for testing. Remove and use free input later
# query = input("Query:")

#query = "Donald Trump accuses China of artificially creating climate change"
#query = "Climate change is very important"
#print("Input Query:", query)

while True:
    query = input("Query:")

    query_tokens = lemmatize_sentence(query)  # lemmatize the tokens so they match the vocabulary
    query_vector = []
    query_tf = {}
    total_query_vocab = 0
    for tok in query_tokens:
        try:
            indexvalue = vocab.index(tok)
            query_vector.append(indexvalue)
            query_tf[indexvalue] = 1 + query_tf.get(indexvalue, 0)
            total_query_vocab += 1
        except ValueError:  # Token doesn't exist in vocab - ignored
            print(tok, "does not exist in the vocabulary. - Ignoring")

    print("Query as vocab indices:", query_vector)
    print()
    start_time = time.time()  # Timer starts
def lemmatize_words(sentence):
    from pywsd.utils import lemmatize_sentence
    return lemmatize_sentence(sentence)
Example #28
            except: # In case the content word is not in WordNet
                synset = ''#'#NOT_IN_WN#'
        else:
            synset = ''#'#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, pos_, synset))
    #    else:
    #        tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    #if prefersNone and not keepLemmas:
    #    tagged_sentence = [(word, None) if str(tag).startswith('#')
    #                       else (word, tag) for word, tag in tagged_sentence]
    #if prefersNone and keepLemmas:
    #    tagged_sentence = [(word, lemma, None) if str(tag).startswith('#')
    #                       else (word, lemma, tag) for word, lemma, tag in tagged_sentence]
    return tagged_sentence
    
    
if __name__=='__main__':
    
    sentence = "I went to the bank to get a loan."
    
    surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True)
    
    print(surface_words, lemmas, morphy_poss)
    
    result = disambiguate('I went to the bank to deposit my money', \
        keepLemmas=True, prefersNone=False)
    
    print(result)