Example #1
  def validate(self, tag, media):
    # check with wordnet
    # if synset continue else return -1
    # check category and decide if verify
    # use google vision api to verify
    # result: 1 -> good (keep image) ; 0 -> bad (discard image) ; -1 -> cannot validate (keep)
    tag = singularize(tag).lower()
    synset = wordnet.synsets(tag, pos=NOUN)
    if not synset:
      return -1
    category = synset[0].lexname
    if self.VALIDATE_CATEGORY == 'all':
      pass
    elif category in self.VALIDATE_CATEGORY:
      pass
      # do not return yet
    else:
      return -1 # not all and cannot be validated

    img = requests.get(media)
    gImage = google.cloud.vision.types.Image(content=img.content)
    response = self.vision_client.label_detection(image=gImage)
    # keep only confidently detected labels (score > 0.9)
    labels = [d.description for d in response.label_annotations if d.score > 0.9]
    # check if tag in the detected labels with a good probability (score)
    if tag in labels:
      return 1

    # compare synonyms
    synonyms = synset[0].synonyms
    # check if any synonym in labels
    for synonym in synonyms:
      if singularize(synonym).lower() in labels:
        return 1
    return 0
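For reference, a minimal sketch of the normalize-then-look-up step this validator depends on, assuming pattern.en provides singularize and the bundled wordnet wrapper (the Vision API call is stubbed out here):

from pattern.en import singularize, wordnet

tag = singularize("Dogs").lower()  # "Dogs" -> "dog"
synsets = wordnet.synsets(tag, pos=wordnet.NOUN)
if synsets:
    # lexname groups the sense into a broad category such as 'noun.animal'
    print(synsets[0].lexname)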
Example #2
def unify_query(query):
    """ 
    a peek of sorts .. 
    param: list of Word objs 
    return: synset entry from wn
    """
    
    
    
    #build query from 
    query = build_wn_query(query)
    print 'wordnet query: {0}'.format(query)
  
    s = wordnet.synsets(singularize(query), pos=wordnet.NOUN)
    
    if len(s) == 0:
        
        #this is a bit hacky.. it's based on the assumption, if it fails, it may be a two word NN 
        #i.e. thrill ride fails, ride doesn't 
        print 'no entry for {0}..'.format(query) 
        
        s = wordnet.synsets(singularize(query.split()[1]), pos=wordnet.NOUN)
        if len(s) == 0:
             print 'no entry for {0}'.format(query.split()[1]) 
    
        
    
    return s
Example #3
 def getKeywords(self):
     """
     Extract keywords using POS tagging
     :return: Query keywords
     """
     nouns = []
     if len(self.sentences) == 1:
         s = re.sub('[' + string.punctuation + ']', '', self.sentences[0])
         self.r.extract_keywords_from_text(s)
         rp = self.r.get_ranked_phrases()
         for n in rp:
             tokens = nltk.tokenize.word_tokenize(n)
             if len(tokens) == 1:
                 item, tag = nltk.pos_tag(tokens)[0]
                 if 'NN' in tag:
                     if len(item) > 1:
                         if singularize(item) not in nouns and pluralize(
                                 item) not in nouns:
                             nouns.append(item)
             else:
                 nouns.append(n)
         return nouns
     for s in self.sentences:
         s = re.sub('[' + string.punctuation + ']', '', s)
         tokens = nltk.tokenize.word_tokenize(s)
         tagged = nltk.pos_tag(tokens)
         final_nouns = []
         for item, t in tagged:
             if 'NN' in t:
                 if len(item) > 1:
                     if singularize(item) not in final_nouns and pluralize(
                             item) not in final_nouns:
                         final_nouns.append(item)
         nouns.append(final_nouns)
     return nouns
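The plural-aware deduplication used in both branches above can be isolated into a small helper; a sketch assuming pattern.en's singularize and pluralize:

from pattern.en import singularize, pluralize

def add_noun(nouns, item):
    # skip the word if its singular or plural form is already collected
    if singularize(item) not in nouns and pluralize(item) not in nouns:
        nouns.append(item)
    return nouns

print(add_noun(["cat"], "cats"))  # ['cat'] -- "cats" singularizes to an existing entry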
Example #4
    def getAnonymizationStructure(self, words):
        # deal with first word
        if singularize(words[0].lower()) in self.ind.stop_words or singularize(
                words[0].lower()) in self.ents:
            words[0] = words[0].lower()
        self.normalizeTables(words)

        spans = {}
        i = 0
        while i < len(words):
            for w in range(3, 0, -1):
                at_most_2 = {}
                span = ' '.join(words[i:i + w]).replace('#', ' ')
                if not self.allInitCaps(span):
                    continue
                exact = False  # all-init-caps spans are searched with exact=False
                (docs, typs) = self.ind.getKey(span,
                                               exact=exact,
                                               case=(len(span) <= 2))
                if docs:
                    spans[i] = {'width': w, 'docs': [], 'types': []}
                    for j in range(0, len(docs)):
                        if at_most_2.setdefault(
                                typs[j],
                                0) < 2:  #  typs[j] not in spans[i]['types']:
                            spans[i]['types'].append(typs[j])
                            spans[i]['docs'].append(docs[j])
                            at_most_2[typs[j]] += 1

                    i = i + w - 1  # we have finished the span
                    break
            i += 1
        return spans
Example #5
 def __init__(self, doc_name, n):
     self.freq = {}
     path = "C:/Users/ARKAZA KUMARI/Desktop/Mini Project/Mini Project/Source Code/" + doc_name
     line = ""
     with open(path) as f:
         line = f.readline()
         line = line.split(" ")
         line = [word.lower() for word in line if word not in ["", " "]]
     stop_words = set(stopwords.words('english'))
     new_stopwords = ['also', 'may', 'must', 'since', 'could', 'whether']
     new_stopwords_list = stop_words.union(new_stopwords)
     filtered_words = [
         word for word in line if word not in new_stopwords_list
     ]
     writepath = "C:/Users/ARKAZA KUMARI/Desktop/Mini Project/Mini Project/Source Code/words" + str(
         n) + ".txt"
     with open(writepath, 'w') as f:
         for i in filtered_words:
             #print(i)
             i = re.sub(r'[^\w]', '', i)
             count = self.freq.get(singularize(i), 0)
             self.freq[singularize(i)] = count + 1
             #f("%s\n" % i)
             f.write(singularize(i) + "\n")
Example #6
    def singularize(self, word):
        '''
        Given a base-form of noun, return a singular form
        (For Noun only)

        Args:
            word (str): base-form of noun

        Raises:
            ValueError: no base form can be found for the word
            ValueError: a base form is found, but it is not in the vocabulary

        Returns:
            str: singular form of noun
        '''
        if word in self._word2index:
            return singularize(word)
        else:
            try:
                base_form_word = lemma(word)
            except Exception:
                raise ValueError(
                    "Cannot find the base form for '{}'".format(word))
            if base_form_word in self._word2index:
                return singularize(base_form_word)
            raise ValueError(
                "Found the base form for '{}': '{}', but even the base form "
                "is not in the vocabulary".format(word, base_form_word))
Example #7
 def inject(self, title, word_pair):
     for (i, slot), word in zip(title.slots, word_pair):
         word = word.replace("_", " ").title()
         if slot == 'NOUN':
             title.inject(singularize(word), slot, i)
         elif slot == 'NOUNS':
             title.inject(pluralize(singularize(word)), slot, i)
         else:
             title.inject(word, slot, i)
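The singular/plural slot handling above reduces to a normalize-then-reinflect round trip; a sketch with pattern.en (exact casing may vary by pattern version):

from pattern.en import singularize, pluralize

word = "mugs".title()                # "Mugs"
print(singularize(word))             # NOUN slot, e.g. "Mug"
print(pluralize(singularize(word)))  # NOUNS slot, e.g. "Mugs"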
Example #8
def approx_match(label, gold_label, use_includes=False):
    if label == gold_label:
        return True
    # Approximate matching strategy from Zesch and Gurevych (2009).
    # Following their human validation test, we implement the MORPH and
    # INCLUDES matching strategies.
    singularized_label_tokens = [singularize(token) for token in label.split()]
    singularized_gold_label_tokens = [singularize(token) for token in gold_label.split()]
    if use_includes:
        return contains_sublist(singularized_label_tokens, singularized_gold_label_tokens)
    else:
        return singularized_label_tokens == singularized_gold_label_tokens
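A quick illustration of the two strategies, under the assumption that contains_sublist(a, b) tests whether b occurs as a contiguous sublist of a:

# MORPH: token-wise singularization makes the labels comparable
approx_match("neural networks", "neural network")                           # True
# INCLUDES: the gold label occurs inside the longer candidate
approx_match("deep neural networks", "neural network", use_includes=True)   # True, given contains_sublist above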
Example #9
def pluralizationError(text, nlp, correctFlag=False):
    '''
    Purpose: To check for pluralization errors.
             Additionally, it returns the corrected sentence.

    Parameters: text: string
                    A single sentence or a paragraph of text.

                nlp: NLP pipeline (e.g. stanza) used to tokenize
                    and POS-tag the text.

                correctFlag: boolean
                    True or False

    Returns: count: integer
             text: corrected sentence (only if correctFlag is True)
    '''

    doc = nlp(text)
    count = 0
    text = ""
    for s in doc.sentences:
        for i in range(len(s.words)):
            if (i != len(s.words) - 1) and (s.words[i].xpos == "NN"
                                            or s.words[i].xpos == "NNP"):
                if s.words[i + 1].xpos in ["VB", "VBP"]:
                    count += 1
                    text += pluralize(s.words[i].text) + " "
                else:
                    text += s.words[i].text + " "
            elif (i != len(s.words) - 1) and (s.words[i].xpos == "NNS"
                                              or s.words[i].xpos == "NNPS"):
                if s.words[i + 1].xpos == "VBZ":
                    text += singularize(s.words[i].text) + " "
                else:
                    text += s.words[i].text + " "
            elif (i != len(s.words) - 1) and s.words[i].xpos == "CD":
                if s.words[i].text == "1" or s.words[i].text == "one":
                    if s.words[i +
                               1].xpos == "NNS" or s.words[i +
                                                           1].xpos == "NNPS":
                        count += 1
                        s.words[i + 1].text = singularize(s.words[i + 1].text)
                        text += s.words[i].text + " "
                else:
                    if s.words[i + 1].xpos == "NN" or s.words[i +
                                                              1].xpos == "NNP":
                        count += 1
                        s.words[i + 1].text = pluralize(s.words[i + 1].text)
                        text += s.words[i].text + " "
            else:
                text += s.words[i].text + " "
    if correctFlag == True:
        return count, text
    else:
        return count
Example #10
def get_related_noun_or_not(noun, d=True):
	w = wordnet.synsets(noun)
	if w:
		w = w[0]
		w1 = w.hyponyms()
		w2 = w.hypernyms()
		if w1 + w2:
			nw = random.choice(w1 + w2)
			if nw and nw.senses:
				return nw.senses[0]
	elif wordnet.synsets(singularize(noun)) and d:
		return get_related_noun_or_not(singularize(noun), False)
	return noun
Example #11
def get_document_topics(doc, name):
  lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
  englishStopWords = get_stopwords('english', name)
  text = [singularize(word) for word in doc.lower().split() if singularize(word) not in englishStopWords and word.isalpha() and len(word) > 1]
  dictionary = gensim.corpora.Dictionary.load(name + '.dict')
  document_topics = lda.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.05)
  if len(document_topics) > 0:
    primary_topic_tuple = max(document_topics, key=lambda x:x[1])
    topic_terms = lda.show_topic(primary_topic_tuple[0])
    print topic_terms
    return document_topics, topic_terms
  else:
    return [], []
Example #12
def get_related_or_not(word, d=True, pos='NN'):
    w = wordnet.synsets(word, pos=pos)
    if w:
        w = w[0]
        w1 = w.hyponyms()
        w2 = w.hypernyms()
        if w1 + w2:
            nw = random.choice([w] + w1 + w2)
            if nw and nw.senses:
                return nw.senses[0]
    elif wordnet.synsets(singularize(word)) and d:
        return get_related_or_not(singularize(word), False, pos)
    return word
Example #13
def word_normalize(text):
    normal_form = parse_text(text)

    # POS tags for plural nouns, comparative/superlative adjectives and inflected verbs
    noun_tags = ["NNS", "NNPS"]
    adjective_tags = ["JJR", "JJS"]
    verb_tags = ["VBD", "VBG", "VBN", "VBP", "VBZ"]

    buff_string = []  # buffer list collecting the normalized words

    for word in normal_form:
        if word.tag in noun_tags:
            # plural noun -> singular form (singularize from the pattern.en module)
            buff_string.append(str(singularize(word.string, pos=NOUN)))
        elif word.tag in adjective_tags:
            # comparative/superlative -> basic adjective form
            buff_Adj = str(singularize(word.string, pos=ADJECTIVE))
            buff_string.append(not_comperative_superlative(buff_Adj))
        elif word.tag in verb_tags:
            # inflected verb -> infinitive, via pattern's lemma()
            buff_string.append(str(lemma(word.string)))
        else:
            buff_string.append(str(word.string))
    return buff_string
Example #14
def wordListPrint(fileName):
    """
    Remove all character names, plurality, and stop words
    """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words("english")
    for word in corpusBuilder(fileName):
        word = word.strip(".:,()?!;[]")
        word = singularize(word)
        if word.lower() not in characterList and word.lower() not in swords and len(word) > 1:
            print "%s\t%s" % (word.lower(), 1)
        else:
            continue
Example #15
def stem_word(word):
    try:
        if word.endswith("s"):
            if singularize(word) in nltk_words:
                return singularize(word)
            else:
                return word
        if word.endswith("d") or word.endswith('ing'):
            if conjugate(word) in nltk_words:
                return conjugate(word)
            else:
                return word
    except:
        return word
    return word
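Example calls, assuming nltk_words is a lookup set built from NLTK's word list (e.g. set(nltk.corpus.words.words())):

print(stem_word("apples"))  # "apple" -- the singular form is in nltk_words
print(stem_word("walked"))  # "walk" if conjugate() maps it to a listed base form
print(stem_word("nltk"))    # unchanged: no suffix rule applies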
Example #16
def approx_match(label, gold_label, use_includes=False):
    if label == gold_label:
        return True
    # Approximate matching strategy from Zesch and Gurevych (2009).
    # Following their human validation test, we implement the MORPH and
    # INCLUDES matching strategies.
    singularized_label_tokens = [singularize(token) for token in label.split()]
    singularized_gold_label_tokens = [
        singularize(token) for token in gold_label.split()
    ]
    if use_includes:
        return contains_sublist(singularized_label_tokens,
                                singularized_gold_label_tokens)
    else:
        return singularized_label_tokens == singularized_gold_label_tokens
Example #17
def wordListPrint(fileName):
    """
    Remove all character names, plurality, and stop words
    """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words('english')
    for word in corpusBuilder(fileName):
        word = word.strip('.:,()?!;[]')
        word = singularize(word)
        if word.lower() not in characterList and word.lower(
        ) not in swords and len(word) > 1:
            print '%s\t%s' % (word.lower(), 1)
        else:
            continue
Example #18
def wordListCleaner(fileName):
    """
    Remove all character names, plurality, and stop words
    """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words('english')
    for word in corpusBuilder(fileName):
        word = word.strip('.:,()?!;[]')
        word = singularize(word)
        if word.lower() not in characterList and word.lower() not in swords and len(word) > 1:
            wordList.append(word.lower())
        else:
            continue
    return wordList
Example #19
def wordListCleaner(fileName):
    """
    Remove all character names, plurality, and stop words
    """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words('english')
    for word in corpusBuilder(fileName):
        word = word.strip('.:,()?!;[]')
        word = singularize(word)
        if word.lower() not in characterList and word.lower(
        ) not in swords and len(word) > 1:
            wordList.append(word.lower())
        else:
            continue
    return wordList
Example #20
def inject_sub_nn(sent_i, e_config):
    target_indices = []
    for i, w_i in enumerate(sent_i):
        if w_i['tag'] in ('NN', 'NNS'):
            target_indices.append(i)
    if target_indices:
        target_index = target_indices[random.randint(0,
                                                     len(target_indices) - 1)]
        target_token = sent_i[target_index]['form']
        target_tag = sent_i[target_index]['tag']

        new_token = ""
        new_tag = ""
        if target_tag == "NN":
            new_token = pluralize(target_token)
            new_tag = "NNS"
        elif target_tag == "NNS":
            new_token = singularize(target_token)
            new_tag = "NN"
        else:
            raise
        sent_i[target_index]['form'] = str(new_token)
        sent_i[target_index]['tag'] = new_tag
        sent_i[target_index]['ctag'] = new_tag
    else:
        pass
    return sent_i
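A sketch of the noun-number flip on a single-token sentence; e_config is unused by the function, so None is passed here:

sent = [{'form': 'dog', 'tag': 'NN', 'ctag': 'NN'}]
inject_sub_nn(sent, e_config=None)
print(sent)  # [{'form': 'dogs', 'tag': 'NNS', 'ctag': 'NNS'}]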
Example #21
def process(line):
    # replace some known utf-8 chars with ascii
    line = re.sub("\xe2\x80\x99", "x", line)  # U+2019 (right single quotation mark)
    line = re.sub("\xe2\x80\x93", "-", line)  # U+2013 (EN-DASH)
    # remove the rest of the non-ascii chars
    line = re.sub(r'[^\x00-\x7F]+', ' ', line)

    sentences = nltk.tokenize.sent_tokenize(line)

    # print('---------------')
    tags = set()
    for sentence in sentences:
        # words
        all_words = [singularize(w).capitalize() for w in nltk.tokenize.word_tokenize(sentence)]
        words = {remove_nonalpha(w).lower() for w in all_words if accept_word(w)}
        # search solr
        for word in words:
            # print(word)
            tags.update(query(word))

        # bigrams
        all_bigrams = nltk.bigrams(all_words)
        bigrams = {b for b in all_bigrams if accept_word(b[0]) and accept_word(b[1])}
        for bigram in bigrams:
            b = '%s_%s' % (remove_nonalpha(bigram[0]), remove_nonalpha(bigram[1]))
            b = b.lower()
            # print('>>>>>>>>> %s' % b)
            tags.update(query(b))

    return ",".join(tags).encode('utf-8')
Example #22
def conceptnet_relatedness(subject, candidates, object):
    base_score = call_cp_api(subject, object)

    pred_subject = subject

    # print(base_score)
    # Is there any other label in the ranking making more sense?

    for o_class, confidence in candidates.items():

        f_class, _ = formatlabel(o_class)

        if f_class == subject:
            continue  # Skip the object itself

        score = call_cp_api(f_class, object)

        if score > base_score:
            base_score = score
            pred_subject = o_class

    print("CONCEPTNET: Within the ranking, the most likely subject is %s" %
          pred_subject)
    if singularize(pred_subject) == pred_subject:
        # Re-format back for evaluation
        pred_subject = pluralize(pred_subject)

    pred_subject = reverse_map(pred_subject)

    return pred_subject.replace('_', '-'), base_score
Example #23
    def perturb(self, word, tag):
        res = ""
        # perturb verb
        if 'V' in tag:
            vs = pe.lexeme(word)
            res = choice(vs)

            while (res == word or len(res) > len(word)) and (vs[0] != word):
                res = choice(vs)
            if vs[0] == word:
                res = vs[1]

        # perturb plural/singular noun
        if 'NNS' == tag:
            res = pe.singularize(word)
            if res == word:
                res = word[:-1]

        if len(res) > 0:
            return (res, word, (0, len(res)))
        else:
            #if the perturbed result is empty, we just randomly remove some chars in the word
            removeLen = randint(1, min(len(word) - 1, 3))
            lenw = len(word)
            removestart = lenw - removeLen
            return (word[:removestart] + word[removestart + removeLen:], word,
                    (0, lenw - removeLen))
Example #24
    def do_flower(self, i, j):
        """Process finding a flower and possibly doing something with it"""

        # Get a random color and flower name
        color = random.choice(self.JSON['colors'])['color']
        flower = singularize(random.choice(self.JSON['flowers']))

        # Print them
        self.TEMP += "There was a beautiful " + color + " " + flower + " there. "
        self.TEMP += "It smelled like " + pluralize(
            random.choice(self.JSON['fruits'])) + "."

        # Put a square on the map to mark the flower
        self.IMAGE.filledRectangle((i * 15 + 4, j * 15 + 4),
                                   (i * 15 + 11, j * 15 + 10),
                                   self.COLORS['purple'])

        # Is the narrator keeping this flower?
        if random.randrange(100) < 10:
            self.TEMP += " I picked it"

            if self.FLOWERS:
                self.TEMP += " and added it to the rest of my bouquet"

            self.TEMP += "."

            self.FLOWERS.append({'color': color, 'flower': flower})

        # Does the narrator eat this flower instead?
        elif random.randrange(100) < 5:
            self.TEMP += " For some reason I ate it. It tasted " + random.choice(
                self.TASTES) + "."

        self.TEMP += "\n"
        self.THEN = False
Example #25
File: views.py Project: aczapata/twitter
def transform(term, term_modified):
    if term == 'VBZ' or term == 'VBP' or term == 'VBN' or term == 'VBG' or term == 'VBD':
        return lemmatizer.lemmatize(''.join(term_modified), 'v')
    elif term == 'NNS':
        return singularize(''.join(term_modified))
    else:
        return term_modified
Example #26
def pos_all(word):
    rlist = []
    _rtense = ('infinitive', 'present', 'past', 'future')
    _rperson = (1, 2, 3)
    _rnumber = ('singular', 'plural')
    _rmood = ('indicative', 'imperative', 'conditional', 'subjunctive')
    _raspect = ('imperfective', 'perfective', 'progressive')
    for rtense in _rtense:
        for rperson in _rperson:
            for rnumber in _rnumber:
                for rmood in _rmood:
                    for raspect in _raspect:
                        item = conjugate(word,
                                         tense=rtense,
                                         person=rperson,
                                         number=rnumber,
                                         mood=rmood,
                                         aspect=raspect,
                                         negated=False)
                        if item not in rlist:
                            rlist.append(item)

    print bcolors.Magenta + "All pos of " + word
    print_list(rlist, 4)
    print "Singluar    : " + singularize(
        word) + "			Plural      : " + pluralize(word)
    print "Comparative : " + comparative(
        word) + " 			Superlative : " + superlative(word)
Example #27
def replace_sql(sql, select_clause, from_clause, where_clause):
   """Perform replacement on skeleton SQL"""

   sql = sql.substitute(columns=select_clause, tables=from_clause, where=where_clause)

   # Build a GROUP BY clause if the SELECT has a COUNT 
   group_by = ''
   print str(select_clause.find('COUNT')) + ' :: ' + select_clause
   if select_clause.find('COUNT') >= 0:
      group_by = ' GROUP BY ' + singularize(from_clause.strip().split(' ')[0]) + '_type\n'

   sql = sql + group_by

   # Build an ORDER BY clause 50% of the time
   order_by = ''
   if random.choice(range(1, 100)) < 50:
      if select_clause.split(' ')[0] == '*':
         order_by = ' ORDER BY ' + random.choice(from_clause.strip().split(' ')[0:]) + '_name'
      elif select_clause.split(' ')[0] != 'COUNT(*)':
         order_by = ' ORDER BY ' + select_clause.split(' ')[0].replace(',', '')

      if len(order_by):
         order_by = order_by + random.choice(['', ' ASC', ' DESC'])

   # But only attach the ORDER BY if it keeps us under 140 characters
   if len(sql + order_by) < 140:
      sql = sql + order_by

   return sql.strip() + ';'
Example #29
def cleanData(data_matrix):
    printable = set(string.printable)
    prepositions = ["is", "a", "at", "the", "which", "on ", "to"]

    for line in data_matrix:
        line[1] = line[1].replace("UPDATE 5-", "")
        line[1] = line[1].replace("UPDATE 1-", "")
        line[1] = line[1].replace("UPDATE ", "")
        line[1] = line[1].replace("UPDATE: ", "")

        line[1] = line[1].replace("Companies", "")
        line[1] = line[1].replace("Insight - ", "")
        line[1] = line[1].replace(" - Quick Facts", "")
        line[1] = line[1].replace(" ...", "")

        line[1] = filter(lambda x: x in printable, line[1])
        line[1] = line[1].lower()
        line[1] = line[1].translate(None, string.punctuation)

        # for prep in prepositions:
        # 	line[1] = line[1].replace(prep, "")

        sentence_array = nltk.word_tokenize(line[1])
        # for pr in prepositions:
        # 	try:
        # 		sentence_array.remove(pr)
        for i in range(len(sentence_array)):
            #sentence_array[i] = str(WordNetLemmatizer().lemmatize(sentence_array[i], 'v'))
            sentence_array[i] = str(singularize(sentence_array[i]))

        line.append(sentence_array)
Example #30
def change_pluralization(token):
    singularForm = singularize(token)
    pluralForm = pluralize(token)
    if token == singularForm:
        return pluralForm
    else:
        return singularForm
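Usage is symmetric: whichever form comes in, the other comes out. Words whose singular equals the input, like "sheep", come back pluralized, which for "sheep" is again "sheep":

print(change_pluralization("cat"))    # "cats"
print(change_pluralization("cats"))   # "cat"
print(change_pluralization("sheep"))  # "sheep"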
Example #31
def getSynonyms(word, part):
    synonyms = []
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part == "VBD" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[3])
            elif part == "VBG" and len(lexeme(syn)) > 0:
                synonyms.append(lexeme(syn)[0])
            elif part == "VBN" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[-1])
            elif part == "VBZ" and len(lexeme(syn)) > 1:
                synonyms.append(lexeme(syn)[1])
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
Example #32
    def updateTerms(self, line, w2vmodel):
        list_term = line.split('_')
        list_result = []

        whitelist = set(
            ['win', 'won', 'most', 'biggest', 'largest', 'fastest'])
        blacklist = set(['give', 'also'])
        stoplist = set(stopwords.words('english'))

        for term in list_term:
            if term in blacklist:
                continue
            if term not in whitelist and term in stoplist:
                continue
            # find
            lem = lemma(term)
            sing = singularize(term)

            if term in w2vmodel.vocab:
                list_result.append(term)
            elif lem in w2vmodel.vocab:
                list_result.append(lem)
            elif sing in w2vmodel.vocab:
                list_result.append(sing)
        return list_result
Example #33
def conjugate_noun(noun, pos):
    if pos == "NNS" or pos == "NNPS":
        return str(ptn.pluralize(noun))
    elif pos == "NN" or pos == "NNP":
        return str(ptn.singularize(noun))
    else:
        return noun
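Example calls (ptn is assumed to be pattern.en imported under that alias):

print(conjugate_noun("analysis", "NNS"))  # "analyses"
print(conjugate_noun("analyses", "NN"))   # "analysis"
print(conjugate_noun("ran", "VBD"))       # "ran" -- non-noun tags pass through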
Example #34
def synonyms(data):
    augment_n = 10
    data_dict = dict((key,[val]) for val,key,_ in data)

    is_plural = lambda word: singularize(word) != word
    stops = set(stopwords.words('english') + ['l'])

    for disease in data:
        for _ in range(augment_n):
            new_facts_list = []
            for fact in disease[0]:
                new_fact = fact[:]
                for k,word in enumerate(fact):
                    if word not in stops:
                        syn = wordnet.synsets(word)
                        if syn:
                            random_syn = syn[0]              
                            random_lemma = random.choice(random_syn.lemma_names())
                            random_lemma = pluralize(random_lemma) if is_plural(word)\
                                                else random_lemma
                            random_lemma = random_lemma.lower()
                            random_lemma = random_lemma.replace('_',' ')
                            random_lemma = random_lemma.replace('-',' ')
                            if ' ' in random_lemma:
                                continue
                            new_fact[k] = random_lemma
                new_facts_list.append(new_fact)
            #print new_facts_list
            data_dict[disease[1]].append(new_facts_list[:])
    return data_dict
Example #35
def phrase_search(q, positional_index):
    q = q.strip("'")
    q = q.strip()  # to remove white space in the phrase query
    phrase_query = []
    for val in q.split(" "):
        phrase_query.append(singularize(val))
    combine_doc = {}
    for index in range(0, len(phrase_query)):
        if(len(combine_doc) == 0):
            combine_doc = positional_index[str(
                phrase_query[index])][1]

        else:
            match = {}
            print(positional_index[phrase_query[index]][1])
            for key, value in combine_doc.items():
                for key1, value2 in positional_index[phrase_query[index]][1].items():
                    print("1")
                    if(key == key1):
                        print("2")
                        for position in value:
                            for position2 in value2:
                                if (position + 1 == position2):
                                    match.setdefault(key, set()).add(position2)
            combine_doc = match
    relevent_docs = set()
    for keys in combine_doc:
        relevent_docs.add(int(keys))
    return relevent_docs
Example #36
File: rank.py Project: muyun/dev.nlp
def _isplural(w):
        word = w.lower()
        singula = singularize(word)
        if singula == word:
            return False
        else:
            return True
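The test is simply "does singularize change the word?"; for example:

print(_isplural("cats"))  # True  -- singularize gives "cat"
print(_isplural("fish"))  # False -- singularize leaves it unchanged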
Example #37
def getdata():
    inp =raw_input('Enter the topic:')
    while 1:
        try:
            topic= wikipedia.page(inp)
            content1=topic.content #fetches content from the wikipedia webpage in the form of text
            break
        except wikipedia.exceptions.DisambiguationError as e:
            c=1
            for i in e.options:
                print str(c) + '.' + i
                c+=1
            choice=input('Enter your choice:')
            inp=e.options[choice-1]
            topic= wikipedia.page(inp)
            content1=topic.content
            break

##    1=re.sub("[\(\[].*?[\)\]]", "", summ)
    content1=content1.encode('ascii','ignore')
    content1=content1.lower()
    tokens=nltk.word_tokenize(content1)
    tagged=nltk.pos_tag(tokens)
    freqdic={}
    for i in tagged:
        word=singularize(i[0])
        if i[1] in ['NN','NNS','NNP','NNPS','FW'] and not (word in inp.lower().split()) and word.isalpha() : #iterates through the text and filters out for nouns and the various forms of nouns
            if word in freqdic:                                                                             #makes sure the dictionary doesn't contain the word itself, also no pronouns and numbers
                freqdic[word]+=1
            else:
                freqdic[word]=1
    return freqdic
Example #38
def conjugate_noun(noun, pos):
    if pos=="NNS" or pos =="NNPS":
        return str(pluralize(noun))
    elif pos=="NN" or pos =="NNP":
        return str(singularize(noun))
    else:
        return noun
Example #39
def process_agent_output(answer_template, noun, nouns, noun_topics,
                         answer_sentiment):
    agent_output = answer_template.answer
    temp_nouns = nouns
    #print(agent_output, nouns, noun_topics, (nouns))
    if answer_template.fetch_count > 0 and noun_topics != None and len(
            noun_topics) > 0:
        #print(noun_topics)
        if question_sentiment in sentiment_opt_pos:
            temp_nouns = topic_favorites[noun_topics[0]]
            #like_memory.loc[like_memory['sentiment'] > 0.5 && like_memory['topic'] == noun_topics[0]].sample().subject
        elif question_sentiment in sentiment_opt_neg:
            temp_nouns = topic_dislike[noun_topics[0]]
        sing_noun = singularize(noun)
        plural_noun = pluralize(noun)
        if sing_noun in temp_nouns: temp_nouns.remove(sing_noun)
        elif plural_noun in temp_nouns: temp_nouns.remove(plural_noun)

    #replace nouns
    for i in range(1, answer_template.fetch_count + 1):
        temp = "noun_" + str(i)

        agent_output = agent_output.replace(wildcards[temp], temp_nouns[i - 1])

    if answer_template.use_noun:
        agent_output = agent_output.replace(wildcards["noun"], noun)
    if answer_template.use_sentiment:
        agent_output = agent_output.replace(wildcards["sentiment"],
                                            question_sentiment)
    agent_output = agent_output.replace(wildcards["agent_sentiment"],
                                        answer_sentiment)
    #print(agent_output)
    return agent_output
Example #40
   def do_flower(self, i, j):
      """Process finding a flower and possibly doing something with it"""

      # Get a random color and flower name
      color  = random.choice(self.JSON['colors'])['color']
      flower = singularize(random.choice(self.JSON['flowers']))

      # Print them
      self.TEMP += "There was a beautiful " + color + " " + flower + " there. "
      self.TEMP += "It smelled like " + pluralize(random.choice(self.JSON['fruits'])) + "."

      # Put a square on the map to mark the flower
      self.IMAGE.filledRectangle((i * 15 + 4, j * 15 + 4), (i * 15 + 11, j * 15 + 10), self.COLORS['purple'])

      # Is the narrator keeping this flower?
      if random.randrange(100) < 10:
         self.TEMP += " I picked it"

         if self.FLOWERS:
            self.TEMP += " and added it to the rest of my bouquet"
 
         self.TEMP += "."

         self.FLOWERS.append({'color': color, 'flower': flower})

      # Does the narrator eat this flower instead?
      elif random.randrange(100) < 5:
         self.TEMP += " For some reason I ate it. It tasted " + random.choice(self.TASTES) + "."

      self.TEMP += "\n"
      self.THEN = False
Example #41
    def _transform_word(self, word, pos, less, more):
        """transforms a word to be less less and more more

        :param word: word to transform
        :type word: str

        :param pos: part of speech of the word
        :type pos: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed word
        :rtype: str
        """

        new_word = self._get_similar_word(word, less, more)
        new_pos = en.tag(new_word)[0][1]

        if (pos[:2] != new_pos[:2]) or word == new_word:
            return word

        # handle noun
        if pos.startswith('NN'):

            # pluralization
            if pos.endswith('S') and not new_pos.endswith('S'):
                new_word = en.pluralize(new_word)

            elif not pos.endswith('S') and new_pos.endswith('S'):
                new_word = en.singularize(new_word)

            # capitalization
            if word[0].isupper():
                new_word = new_word[0].upper() + new_word[1:]
            else:
                new_word = new_word.lower()

        # handle verb
        elif pos.startswith('VB'):

            tense, person, number = en.tenses(word)[0][:3]

            # conjugation
            conjugated = en.conjugate(new_word,
                                    tense=tense,
                                    person=person,
                                    number=number,
                                    parse=False)

            if conjugated is not None:
                new_word = conjugated

        # remove underscores for joint words
        new_word = new_word.replace('_', ' ')

        return new_word
Example #42
def trimSentence(word_POS):
    sentence_array = []
    for word in word_POS:
        # if word[1] == "IN":
        #     #do nothing
        #     pass
        # elif word[1] == "TO":
        #     pass
        # elif word[1] == "$":
        #     pass
        # elif word[1] == "CD":
        #     pass
        # elif word[1] == "CC":
        #     pass
        # elif word[1] == ":":
        #     pass
        # elif word[0] == "%":
        #     pass
        # elif word[0] == "pct" or word[0] == "percent":
        #     pass
        # elif word[0] == "second": #######
        #     pass
        # elif word[0] == "wo":
        #     sentence_array.append("will")
        # elif word[0] == "n't":
        #     sentence_array.append("not")
        #if its a verb, add the base of that verb
        if word[1] == "VB" or word[1] == "VBD" or word[1] == "VBG" or word[
                1] == "VBN" or word[1] == "VBP" or word[1] == "VBZ":
            base = WordNetLemmatizer().lemmatize(word[0], 'v')
            sentence_array.append(base)
        else:
            #add
            sentence_array.append(singularize(word[0]))
    return sentence_array
Example #43
 def inject(self, title, word_pair):
     for i, cat in title.get_slots('NP'):
         if cat == 'plural':
             title.inject(pluralize(word_pair[0]).capitalize(), 'NP')
         else:
             title.inject(singularize(word_pair[0]).capitalize(), 'NP')
     for i, cat in title.get_slots('ADJ'):
         title.inject(word_pair[1].capitalize(), 'ADJ')
Example #44
def getPluralSingular(w):
    word = w
    plural = isplural(word)
    if plural:
        word  = singularize(word)
    else:
        word  = pluralize(word)
    return word
Example #45
def pluralize_singularize(word,prev_word):
    if "thing" in word:
        print word,prev_word
    if "these" in prev_word:
        return pluralize(word)
    elif "this" in prev_word:
        return singularize(word)
    else:
        return word
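Example behaviour, keyed entirely off the preceding determiner:

print(pluralize_singularize("apple", "these"))  # "apples"
print(pluralize_singularize("apples", "this"))  # "apple"
print(pluralize_singularize("apple", "the"))    # "apple" -- unchanged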
Example #46
File: base.py Project: vpramo/xos-1
def xproto_singularize(field):
    try:
        # The user has set a singular, as an exception that cannot be handled automatically
        singular = field['options']['singular']
        singular = unquote(singular)
    except KeyError:
        singular = en.singularize(field['name'])

    return singular
Example #47
File: base.py Project: vpramo/xos-1
def xproto_singularize_pluralize(field):
    try:
        # The user has set a plural, as an exception that cannot be handled automatically
        plural = field['options']['plural']
        plural = unquote(plural)
    except KeyError:
        plural = en.pluralize(en.singularize(field['name']))

    return plural
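A sketch of both helpers on a field dict without an explicit override (en is assumed to be pattern.en, as the xos generators import it):

field = {'name': 'slices', 'options': {}}
print(xproto_singularize(field))            # "slice"  -- falls back to en.singularize
print(xproto_singularize_pluralize(field))  # "slices" -- singularize, then pluralize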
Example #48
def key_set(full_word):
    words = []
    # hack for class etc
    if singularize(full_word) == full_word or full_word.endswith('ss'):
        plural = pluralize(full_word)
        words = [full_word, plural]
    else:
        words = [singularize(full_word), full_word, pluralize(singularize(full_word))]

    for w in words[:]:
        # if not already plural like
        if not w.endswith('s'):
            suffix = 's'
            if any([w.endswith(suf) for suf in ['x', 'z', 'ch', 'sh']]):
                suffix = 'es'
            words.append('%s%s' % (w, suffix))
    tup = tuple(sorted(list(set(words))))
    return tup
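The endswith('ss') guard exists because singularize can over-strip words like "class"; sample outputs:

print(key_set("word"))   # ('word', 'words')
print(key_set("class"))  # ('class', 'classes')
print(key_set("box"))    # ('box', 'boxes') -- the 'es' suffix rule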
Example #49
File: test_en.py Project: daeon/pattern
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm.
     from pattern.db import Datasheet
     i, n = 0, 0
     for sg, pl in Datasheet.load(os.path.join("corpora", "celex-wordforms-en.csv")):
         if en.singularize(pl) == sg:
             i +=1
         n += 1
     self.assertTrue(float(i) / n > 0.95)
     print "pattern.en.singularize()"
Example #50
def c2nn(x): #函数输入chunk 输出chunk中的名词或者名词词组
    a=[]
    ss=""
    #print chunk.string
    for word in x:
        if str(word.type)[0]=="N" and checkword(word.string):
            if ss=="":
                ss=singularize(word.string)
            else:
                ss=ss+" "+singularize(word.string)
        else:
            if ss!="":
                a.append(ss)
                ss=""
       # print word.string+" "+word.type
    if ss!="":
        a.append(ss)
    #print a
    return a
Example #51
 def set_ingredient_tokens(current_recipe):
     for item in current_recipe.ingredients:
         quantity_conversion = {'quarter' : 0.25,'eighth' : 0.125,
                                 'half' : 0.5,'1/4' : 0.25,
                                 '1/8' : 0.125,'1/3' : 0.333,
                                 '2/3' : 0.667,'3/4' : 0.75,
                                 '1/2' : 0.5,'1' : 1.0,
                                 '2' : 2.0,'3' : 3.0,
                                 '4' : 4.0,'5' : 5.0,
                                 '6' : 6.0,'7' : 7.0, 'lots' : 3.0,
                                 '8' : 8.0,'9' : 9.0, '5-6' : 5.5,
                                 'a' : 1.0,'few' : 2.0, 'scant' : 1.0, 
                                 'pinch' : 0.125, 'pinches' : 0.25, 
                                 '4-' : 4.0, 'to' : 0.0, 'tablespoon' : 1.0, 
                                 'teaspoon' : 1.0, 'couple' : 2.0}
                 
         #set 'dumb' quantity by assuming the first item is quanity
         prelim_quantity = nltk.tokenize.word_tokenize(item.source_line)[0]
         
         # EAFP: try a plain float first, then fall back to the conversion table
         try:
             prelim_quantity = float(prelim_quantity)
         except ValueError:
             print "Can't convert :: " + prelim_quantity
             try:
                 prelim_quantity = quantity_conversion[prelim_quantity]
             except KeyError:
                 print "No conversion value found : " + prelim_quantity
                 # need to flag here for a note in the UI
                 prelim_quantity = 0

         item.quantity = prelim_quantity
     
         filterList = ['tsp', 'tsps', 'tbsps', 'tbsp', 'tablespoon', \
                       'tablespoons', 'teaspoon', 'teaspoons', 'cup', \
                       'cups', 'bowl', 'pint', 'quart', 'mg', 'g', 'gram',\
                       'grams', 'ml', 'oz', 'ounce', 'ounces' ] 
         
         item.measure = ' '.join([word for word in item.source_line.split(" ") if word in filterList])
         new_source_line = ' '.join([word for word in item.source_line.split(" ") if word not in filterList])                               
         sentence = parsetree(new_source_line, chunks=True, lemmata=True)
      
         for s in sentence:
             #filter all the NP (noun phrases) into a chunk list
             chunk_list = [singularize(chunk.string) for chunk in s.chunks if chunk.type =='NP']
             search_term = chunk_list[0]
             search_term = "".join([i for i in search_term if i != '/'])
             search_term = ''.join([i for i in search_term if not i.isdigit()])                
             
             item.search_term = search_term
 
     return current_recipe
Example #52
def get_singular_form_of_word(word):
    """ Get singular form of the words.
        Args:
            word (str): keyword.
        Returns:
            (str): singular form of the word.

        TODO: Or convert to base form of the words.

    """
    return singularize(word)
Example #53
def make_thesaurus_lesk(file_path):
    """
    Returns dict of counters 'thesaurus', where
    thesaurus[synset] = { word1: 4, word2: 8, word3: 1, ... }
    """
    thesaurus = defaultdict(lambda: Counter())

    with open(file_path, "r") as f:

        f = f.read().split()
        for i, word_and_tag in enumerate(f):

            word, tag = word_and_tag.rsplit("_", 1)

            # Reject non-ASCII characters
            try:
                word = word.decode("ascii")
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue

            # look at a window of 9 words each time lesk is called
            window = [i - WINDOW, i + WINDOW]
            if i < WINDOW:
                window = [i, i + 2 * WINDOW]
            elif i >= len(f) - WINDOW:
                window = [i - 2 * WINDOW, i]

            synset = lesk.my_lesk(f[window[0] : window[1]], word)

            # if lesk can decide on a meaning for that word, add
            # that meaning, i.e., that synset, to thesaurus
            if not synset:
                continue

            # if word is verb, only add present tense to thesaurus
            if tag[0] == "V":
                word_tenses = tenses(word.lower())
                if "inf" in word_tenses or "1sg" in word_tenses or "2sg" in word_tenses or "3sg" in word_tenses:
                    thesaurus[str(synset)].update([word.lower()])
            elif tag[0] == "N":
                synset_name = synset.name().split(".")[0]
                if synset_name == pluralize(synset_name):
                    thesaurus[str(synset)].update([pluralize(word.lower())])
                else:
                    thesaurus[str(synset)].update([singularize(word.lower())])
            else:
                thesaurus[str(synset)].update([word.lower()])
    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)

    thesaurus = _add_mappings(map_file, thesaurus)
    return thesaurus
Example #54
def docs2corpus(docs, name, isNew):
  print '>> converting documents to corpus...'
  numDocs = len(docs)
  englishStopWords = get_stopwords('english', name)
#  texts = [[word for word in doc.lower().split() if word not in englishStopWords and word.isalpha() and len(word) > 1] for doc in docs]
  texts = [[singularize(word) for word in doc.lower().split() if singularize(word) not in englishStopWords and word.isalpha() and len(word) > 1] for doc in docs]
  # remove words that appear only once
  frequency = defaultdict(int)
  for text in texts:
    for token in text:
      frequency[token] += 1
  texts = [[token for token in text if frequency[token] > 1] for text in texts]
  print len(texts)
  if isNew:
    dictionary = generate_dictionary(texts, name, numDocs) #uncomment for new corpus
  else:
    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
  corpus = [dictionary.doc2bow(text) for text in texts]
  if isNew:
    gensim.corpora.MmCorpus.serialize(name + '.mm', corpus) # store to disk, for later use
  return corpus, dictionary
Example #55
def getIngredientNames(index):
    # get = request.GET
    # index = int(get.get('index'))
    #
    #
# from recipes.views import *
# getIngredientNames(8279)
#
    urlBase = 'http://cooking.nytimes.com/recipes/'
    while index < 2000000:
        url = urlBase + str(index)
        print index
        index += 1
        try:
            req = urllib2.Request(url.encode("utf8"), headers={'accept': '*/*', 'User-Agent' : "Magic Browser"})
            html = urllib2.urlopen(req, timeout=10)
        except:
            continue
        soup = BeautifulSoup(html, "html5lib")
        ingredients = soup.select('.ingredient-name span')
        for i in ingredients:
            i = i.text.lower()
            if not 'nutritional information' in i:
                if ' and ' in i:
                    i = i.split(' and ')
                elif ' or ' in i:
                    i = i.split(' or ')
                elif ', ' in i:
                    i = i.split(', ')
                else:
                    i = [i]
                for part in i:
                    if 'our' in part:
                        Ingredient.objects.get_or_create(name = part)
                    else:
                        if part != singularize(part):
                            print part, singularize(part)
                        Ingredient.objects.get_or_create(name = singularize(part))
    print 'DONE'
Example #56
def custom_similarity(word, synsets, pos=None):
    word = singularize(word.lower())
    similarities = []
    if pos:
        word_synsets = wordnet.synsets(word, pos=pos)
    else:
        word_synsets = wordnet.synsets(word)
    for i in synsets:
        for j in word_synsets:
            try:
                similarities.append(wordnet.similarity(i, j))
            except Exception, e:
                pass
Example #57
	def tagLemma(self, word_old):
		# print tag(word_old)
		x = word_old
		for word, pos in tag(word_old):
			if pos == "NNS":  # plural nouns
				x = singularize(word)
			elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:  # verbs -> infinitive
				x = conjugate(word, INFINITIVE)
				# TODO: fix this; conjugate() sometimes fails and returns None
				if not x:
					x = word
			else:
				x = word
		return x
Example #58
def fullQuery(sentence):
    new_str = ""
    for word in sentence.words:
        if word.string in ['places', 'locations', 'spots']:
            continue
        new_word = singularize(word.string) if word.type == "NNS" else word.string
        new_str += new_word + " "
    singularized_sentence = parsetree(new_str, relations=True, lemmata=True)

    
    m = search('{JJ? NN+} IN {JJ? NN+}', singularized_sentence)
    query = {}
    if len(m) > 0:
        query["term"] = m[0].group(1).string
        query["location"] = m[0].group(2).string
    return query