Пример #1
0
def __english_stemming(words):
    """Stem every entry of ``words`` in place and return the same sequence.

    Args:
        words: mutable sequence of word strings; each element is replaced
            by ``stem(element)``.

    Returns:
        The very same ``words`` object, now holding the stemmed words.
    """
    # enumerate() works on both Python 2 and 3 (xrange is Python-2-only).
    for idx, word in enumerate(words):
        words[idx] = stem(word)
    return words
Пример #2
0
def Generate_ngrams_document(text, max_num, stop, use_pos, ptag, use_stem, split, all_pos_tags):
    """Count the candidate n-grams found in ``text``.

    The text is cut into sentences (``NLP_sent.sentence_splitting``), each
    sentence into phrases (``NLP_sent.phrase_splitting``) and each phrase,
    lower-cased, into words (``NLP_word.word_splitting``).  Every word gets
    a code from ``word_checking_stop(word, stop)``:

    * a code strictly between 0 and 5 may not be the first or last word of
      an n-gram;
    * codes 0 and 5 count as "meaningful" words.

    A single word is kept only when its code is 0 and, if ``use_pos`` is
    set, ``word_checking_pos(tag, ptag)`` returns 0 for it.  A longer
    n-gram is kept only when it contains at least two meaningful words.

    Args:
        text: document text.
        max_num: maximum number of words in an n-gram.
        stop: stop-word resource handed to ``word_checking_stop``.
        use_pos: when truthy, single words are also filtered by POS tag.
        ptag: POS resource handed to ``word_checking_pos``.
        use_stem: when truthy, words are stemmed with ``porter2.stem``
            (after their stop code has been computed on the raw word).
        split: optional separator; when not None, ``text`` is first split
            on it and each piece is sentence-split separately.
        all_pos_tags: dict used as a cache of POS tags keyed by the
            original phrase; it is updated in place.

    Returns:
        dict mapping each n-gram (words joined by a single space, at least
        two characters long) to its number of occurrences.
    """
    doc_ngrams = {}

    if split is None:
        sentences = NLP_sent.sentence_splitting(text, 1)
    else:
        sentences = []
        for can_sen in text.split(split):
            sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))

    for sentence in sentences:
        for phrase in NLP_sent.phrase_splitting(sentence):
            if len(phrase) <= 2:  # e.g. 'ii'
                continue

            words = NLP_word.word_splitting(phrase.lower())

            # POS tags are only needed (and only read) when use_pos is set.
            pos_tags = None
            if use_pos:
                if phrase in all_pos_tags:
                    pos_tags = all_pos_tags[phrase]
                else:
                    pos_tags = NLP_word.word_pos_tagging(words)
                    all_pos_tags[phrase] = pos_tags

            # Record the stop code of every word once, up front, so the
            # n-gram loops below do not have to recompute it.
            stop_pos = []
            for idx, word in enumerate(words):
                stop_pos.append(word_checking_stop(word, stop))
                if use_stem:  # enable or disable stemming
                    words[idx] = porter2.stem(word)

            n_words = len(words)
            for i in range(n_words):
                if 0 < stop_pos[i] < 5:  # invalid first word
                    continue
                for j in range(i + 1, min(n_words, i + max_num) + 1):
                    if 0 < stop_pos[j - 1] < 5:  # invalid last word
                        continue

                    if j == i + 1:
                        # Single word: must have code 0 and pass the
                        # optional POS filter.
                        if stop_pos[i] != 0:
                            continue
                        if use_pos and word_checking_pos(pos_tags[i], ptag) != 0:
                            continue
                    else:
                        # Multi-word: at most (length - 2) words may be
                        # non-meaningful, i.e. at least two must have
                        # code 0 or 5.
                        meaningful = sum(
                            1 for code in stop_pos[i:j] if code in (0, 5))
                        if meaningful < 2:
                            continue

                    ngram = ' '.join(words[i:j])
                    if len(ngram) > 1:  # at least two characters
                        doc_ngrams[ngram] = doc_ngrams.get(ngram, 0) + 1

    return doc_ngrams
Пример #3
0
def _select_umls_form(s, candidates):
    """Choose the form that replaces the span ``s`` among ``candidates``.

    Candidates and ``s`` are decoded as UTF-8.  Rules, applied per candidate
    in iteration order:

    1. a candidate equal to ``s`` wins immediately;
    2. when there is more than one candidate, a candidate whose number of
       tokens equals ``len(s)`` and whose tokens all start with a character
       occurring in ``s`` wins immediately (acronym expansion);
    3. otherwise the shortest candidate seen so far that is under 50
       characters is remembered (the first one wins ties).

    Returns:
        The decoded chosen form, or None when no candidate qualifies.
    """
    best = None
    best_len = 50
    for pt in candidates:
        dpt = pt.decode('utf-8')
        # retain same
        if dpt == s.decode('utf-8'):
            return s.decode('utf-8')
        # acronym
        if len(candidates) > 1:
            tkn = dpt.split()
            if len(tkn) == len(s):
                init = set(s)
                if all(t[0] in init for t in tkn):
                    return dpt
        # retain shorter
        if len(dpt) < best_len:
            best = dpt
            best_len = len(dpt)
    return best


def standardizing(words, umls, stop):
    """Normalize an n-gram (given as a list of words) against ``umls``.

    When ``umls`` is None the words are simply joined with spaces (the
    English-stemming switch ``eng`` is hard-coded to False, so the stemming
    branches are currently never taken).

    Otherwise the words are scanned left to right.  At each position whose
    word has stop code 0 or 5 (``kernel_mining.word_checking_stop``), the
    longest span that also ends on a word with code 0 or 5 is looked for
    such that:

    * its text, replaced by a form chosen from ``umls.norm[text]`` when the
      text is a key of ``umls.norm``, is a key of ``umls.semantic``; and
    * ``umls.stype`` is empty or ``umls.semantic[form] & umls.stype`` is
      non-empty.

    A matching span is emitted as one UTF-8 encoded token and the scan
    resumes after it; otherwise the single word is copied unchanged.

    Args:
        words: list of word strings.
        umls: object exposing ``norm``, ``semantic`` and ``stype``, or None.
        stop: stop-word resource handed to ``word_checking_stop``.

    Returns:
        The normalized words joined by single spaces, or None when no span
        matched or when the number of distinct output tokens is at most
        half (rounded down) of the number of output tokens.
    """
    eng = False

    if umls is None:
        if eng is True:
            # english only
            return ' '.join(__english_stemming(words))
        # nothing to do
        return ' '.join(words)

    # map to umls
    pwords = []
    status = []
    i = 0
    while i < len(words):
        fnd = False
        if kernel_mining.word_checking_stop(words[i], stop) in (5, 0):
            # Try the longest span first: j runs from len(words) down to i+1.
            for j in range(len(words), i, -1):
                if kernel_mining.word_checking_stop(
                        words[j - 1], stop) not in (5, 0):
                    continue
                s = ' '.join(words[i:j])
                # another filter is "and (len(umls.norm[s]) <= 10)"
                if s in umls.norm:
                    s = _select_umls_form(s, umls.norm[s])
                if s not in umls.semantic:
                    continue
                if (len(umls.stype) == 0) or (
                        len(umls.semantic[s] & umls.stype) > 0):
                    pwords.append(s.encode('utf-8'))
                    status.append(True)
                    fnd = True
                    i = j
                    # Stop here: once i has moved to j, every remaining
                    # (smaller) j would only yield an empty span.
                    break
        if fnd is False:
            pwords.append(words[i])
            status.append(False)
            i += 1

    # not found any umls term, ngram not valid
    if True not in status:
        return None

    # english stemmer (only words that were not mapped to umls)
    if eng is True:
        for idx, matched in enumerate(status):
            if matched is False:
                pwords[idx] = stem(pwords[idx])

    # processing repetition
    if len(set(pwords)) <= math.floor(len(pwords) / float(2)):
        return None
    return ' '.join(pwords)
def stem_phrase(phrase):
    """Return ``phrase`` with every whitespace-separated word stemmed.

    The phrase is split on whitespace, each word is passed through
    ``porter2.stem`` and the results are joined back with single spaces.
    """
    return ' '.join(porter2.stem(token) for token in phrase.split())