def calc_common_word_ngram(text_sentence, question):
    """Count how many ngrams of *question* also occur in *text_sentence*."""
    # Strip punctuation before comparing.
    text_sentence = remove_punctuation(text_sentence)
    question = remove_punctuation(question)
    # Turkish-aware lowercasing plus whitespace trimming.
    text_sentence = tr_lower(text_sentence).strip()
    question = tr_lower(question).strip()
    splitter = NGram(N=N_GRAM)
    sentence_ngrams = set(splitter.split(text_sentence))
    question_ngrams = list(splitter.split(question))
    # Every question ngram (duplicates included) found in the sentence counts.
    return sum(1 for gram in question_ngrams if gram in sentence_ngrams)
class NGramMixSegment(Segment):
    """Segmenter that emits ngrams of two sizes (e.g. unigrams + bigrams)."""

    def __init__(self, N=(1, 2), pad_len=0):
        self.model1 = NGram(N=N[0], pad_len=pad_len)
        self.model2 = NGram(N=N[1], pad_len=pad_len)

    def cut(self, sentence):
        """Segment a sentence.

        @params:
            sentence - text to segment.
        @return:
            On success - list of tokens (ngrams of both sizes, model1 first).
            On failure - error message.
        """
        return list(self.model1.split(sentence)) + list(self.model2.split(sentence))
class NGramSegment(Segment):
    """Segmenter that emits fixed-size ngrams."""

    def __init__(self, N=2, pad_len=0):
        self.model = NGram(N=N, pad_len=pad_len)

    def cut(self, sentence):
        """Segment a sentence.

        @params:
            sentence - text to segment.
        @return:
            On success - list of ngram tokens.
            On failure - error message.
        """
        tokens = self.model.split(sentence)
        return list(tokens)
print 'jaro winkler', sim_arry6 sim_arry7 = [ jellyfish.match_rating_comparison(unicode(string[0]), unicode(s)) for s in string ] print 'match rating comparison', sim_arry7 # tokens = word_tokenize([string]) # print(string_token) # print tfidf_matrix # print(y.toarray() ngram_array = [word_grams(s.split(' ')) for s in string] # print ngram_array n = NGram() # print list(n.split(string[0])) ngram_array = [list(n.split(s)) for s in string] # print ngram_array sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string] print 'ngram', sim_arry8 def jaccard_distance(a, b): # print a, b inter_len = float(len(list(a.intersection(b)))) union_len = float(len(list(a.union(b)))) return inter_len / union_len # print list(ngram_array[0].intersection(ngram_array[1])) sim_arry9 = [ jaccard_distance(NGram(ngram_array[0]), NGram(s)) for s in ngram_array
def ngram_str(s, N=3):
    """Return the space-joined ngrams of *s*, with spaces collapsed to underscores."""
    collapsed = s.replace(" ", "_")
    grams = NGram(N=N).split(collapsed)
    return " ".join(grams)
class NgramIndex():
    """ Class used for encoding words in ngram representation """

    def __init__(self, n, loaded=False):
        """
        Constructor

        Parameters
        ----------
        n : int
            ngram size
        loaded : bool, optional
            True when this index was restored from a file; enables
            truncation behaviour in :meth:`complete`.
        """
        self.ngram_gen = NGram(N=n)
        self.size = n
        # id 0 is reserved for the empty/padding ngram
        self.ngram_index = {"": 0}
        self.index_ngram = {0: ""}
        self.cpt = 0       # highest id assigned so far
        self.max_len = 0   # longest encoding seen, in number of ngrams
        self.loaded = loaded

    def split_and_add(self, word):
        """
        Split word in multiple ngram and add each one of them to the index

        Parameters
        ----------
        word : str
            a word
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        # Plain loop instead of a side-effect list comprehension.
        for ngram in ngrams:
            self.add(ngram)
        self.max_len = max(self.max_len, len(ngrams))

    def add(self, ngram):
        """
        Add a ngram to the index

        Parameters
        ----------
        ngram : str
            ngram
        """
        if ngram not in self.ngram_index:
            self.cpt += 1
            self.ngram_index[ngram] = self.cpt
            self.index_ngram[self.cpt] = ngram

    def encode(self, word):
        """
        Return a ngram representation of a word

        Parameters
        ----------
        word : str
            a word

        Returns
        -------
        list of int
            list of ngram index (ngrams missing from the index are skipped)
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        return [self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index]

    def complete(self, ngram_encoding, MAX_LEN, filling_item=0):
        """
        Complete a ngram encoded version of word with void ngram. It's
        necessary for neural network.

        Parameters
        ----------
        ngram_encoding : list of int
            first encoding of a word
        MAX_LEN : int
            desired length of the encoding
        filling_item : int, optional
            ngram index you wish to use, by default 0

        Returns
        -------
        list of int
            list of ngram index
        """
        # Loaded indexes may see longer-than-expected encodings: truncate.
        if self.loaded and len(ngram_encoding) >= MAX_LEN:
            return ngram_encoding[:MAX_LEN]
        assert len(ngram_encoding) <= MAX_LEN
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item] * diff)
        return ngram_encoding

    def save(self, fn):
        """
        Save the NgramIndex

        Parameters
        ----------
        fn : str
            output filename
        """
        data = {
            "ngram_size": self.size,
            "ngram_index": self.ngram_index,
            "cpt_state": self.cpt,
            "max_len_state": self.max_len,
        }
        # Context manager guarantees the file handle is closed.
        with open(fn, 'w') as f:
            json.dump(data, f)

    @staticmethod
    def load(fn):
        """
        Load a NgramIndex state from a file.

        Parameters
        ----------
        fn : str
            input filename

        Returns
        -------
        NgramIndex
            ngram index

        Raises
        ------
        KeyError
            raised if a required field does not appear in the input file
        json.JSONDecodeError
            raised if the input file is not valid JSON
        """
        try:
            with open(fn) as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print("Data file must be a JSON")
            # BUG FIX: previously fell through and crashed with NameError
            # on the unbound `data`; re-raise the real error instead.
            raise
        for key in ["ngram_size", "ngram_index", "cpt_state", "max_len_state"]:
            if key not in data:
                raise KeyError("{0} field cannot be found in given file".format(key))
        new_obj = NgramIndex(data["ngram_size"], loaded=True)
        new_obj.ngram_index = data["ngram_index"]
        new_obj.index_ngram = {v: k for k, v in new_obj.ngram_index.items()}
        new_obj.cpt = data["cpt_state"]
        new_obj.max_len = data["max_len_state"]
        return new_obj