def func():
    model = NGram(log=True)
    print(len(model.vocab), len(model.vocab)**2)
    model.train(smoothing='interpolation')
    # return
    # model.train(smoothing='add-k')
    test_sents_from_text = [
        'Нет, лучше сказать',
        'Генерал еще вчера обещал',
        'Воображаю, как Мари удивлялась',
        # 'Они ей рассказали, что',
        # 'князь рассказал всё в подробности',
        'я вас сейчас обниму и поцелую',
        # 'благодаря хорошей погоде уже распустились все деревья',
    ]
    print('from text:')
    for sent in test_sents_from_text:
        print(f'probability of "{sent}": {model.probability(sent)}')
    test_sents_not_from_text = [
        'Машина Тьюринга является расширением',
        'интуитивный алгоритм может быть реализован',
        'процесс пошагового вычисления, в котором',
        'можно вычислить всё, что можно',
    ]
    print('not from text:')
    for sent in test_sents_not_from_text:
        print(f'probability of "{sent}": {model.probability(sent)}')
def simtitle(request):
    """Calculate similarity based on title and a naive threshold."""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
            n.add(article)
    return render(request, "dump.html", {"article_list": results})
def build_cast_index(movies, tvshows, key):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [[cast[key] for cast in entity['cast']] for entity in entities]
    values = list(set(itertools.chain.from_iterable(values)))
    mapped_entities = {}
    for entity in entities:
        for cast in entity['cast']:
            value = cast[key]
            if value not in mapped_entities:
                mapped_entities[value] = []
            mapped_entities[value].append(entity)
    logger.debug('Iterating {} took {} ms'.format(
        key, int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building {} index took {} ms'.format(
        key, int((time.time() - start) * 1000)))
    return index, mapped_entities
def calc_common_word_ngram(text_sentence, question):
    common = 0
    # Strip punctuation
    text_sentence = remove_punctuation(text_sentence)
    question = remove_punctuation(question)
    # Lowercase and trim before splitting into n-grams
    text_sentence = tr_lower(text_sentence).strip()
    question = tr_lower(question).strip()
    n = NGram(N=N_GRAM)
    list_text_sentence = list(n.split(text_sentence))
    list_question = list(n.split(question))
    for question_word in list_question:
        if question_word in list_text_sentence:
            common += 1
    return common
def _similarity(self, sentences):
    """
    N-gram with similarity.

    The NGram class extends the Python 'set' class with efficient
    fuzzy search for members by means of an N-gram similarity measure.

    Reference:
        Václav Chvátal and David Sankoff.
        Longest common subsequences of two random sequences, 1975.
        Journal of Applied Probability.

        Python module: ngram (https://pypi.org/project/ngram/)
    """
    ngram = NGram(self.corpus.split(), key=lambda x: x.lower(), N=self.N)
    predicts = []

    if not isinstance(sentences, list):
        sentences = [sentences]

    for i in range(len(sentences)):
        split = []
        for x in sentences[i].split():
            sugg = ngram.find(x.lower()) if x not in string.punctuation else None
            split.append(sugg if sugg else x)
        predicts.append(" ".join(split))

    return predicts
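# A minimal sketch of the lookup behind _similarity above, using the ngram
# package directly (the vocabulary here is invented for illustration):
# NGram.find() returns the closest indexed member, or None when nothing is
# similar enough, which is why the code falls back to the original token.
#
#   from ngram import NGram
#   vocab = NGram(['apple', 'banana', 'orange'], key=lambda x: x.lower(), N=2)
#   vocab.find('aple')  # -> 'apple'
#   vocab.find('zzz')   # -> None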
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp,
                  key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
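# Invocation sketch (paths and parameter values hypothetical): fuzzy-join
# rows of left.csv to the most similar rows of right.csv by the given
# columns, keeping at most 5 matches per row at similarity >= 0.24.
#
#   main('left.csv', 0, 'right.csv', 0, 'joined.csv',
#        titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
#
# Each search result is an (item, similarity) pair, which is why the loop
# above writes result[1] (the score) next to result[0] (the matched row).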
def __init__(self, lang, io_dir, max_file_count=0, thread_count=1, uri=None):
    self.BASE_DIR = BASE_DIR
    self.lang = lang
    self.io_dir = os.path.join(io_dir, self.lang)
    self.max_file_count = max_file_count
    self.thread_count = thread_count
    self.client = MongoClient(uri)
    self.db = self.client['nlp']
    self.Collect = self.db['kcm']

    # ngram search
    self.kcmNgram = NGram((i['key'] for i in self.Collect.find({}, {
        'key': 1,
        '_id': False
    })))

    logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                        filename='KCM_{}.log'.format(self.lang),
                        level=logging.INFO)
    logging.info('Begin gen_kcm.py')
    logging.info('input {self.max_file_count} files, '
                 'output to {self.io_dir}, '
                 'maximum file count {self.max_file_count}, '
                 'use {self.thread_count} threads'.format(**locals()))
def dynamicallyAddNewLanguage(self, lan):
    self.LANGUAGES.append(lan)
    self.LanToNGrams[lan] = NGram(lan=lan, N=self.N, V=self.V, weight=self.weight)
    self.LanToIndex[lan] = len(self.LanToIndex)

    # grow the confusion matrix by one row and one column for the new class
    newRow = [[0 for i in range(0, len(self.confusion_matrix[0]))]]
    self.confusion_matrix = np.append(self.confusion_matrix, newRow, 0)
    newCol = [[0] for i in range(0, len(self.confusion_matrix))]
    self.confusion_matrix = np.append(self.confusion_matrix, newCol, 1)

    # pad the per-class metric vectors until they match the language count
    while len(self._TP_per_class) != len(self.LANGUAGES):
        self._precision_per_class = np.hstack(
            [self._precision_per_class, np.array([0.0])])
        self._recall_per_class = np.hstack(
            [self._recall_per_class, np.array([0.0])])
        self._TP_per_class = np.hstack(
            [self._TP_per_class, np.array([0.0])])
        self._FP_per_class = np.hstack(
            [self._FP_per_class, np.array([0.0])])
        self._FN_per_class = np.hstack(
            [self._FN_per_class, np.array([0.0])])
        self._TN_per_class = np.hstack(
            [self._TN_per_class, np.array([0.0])])
        self.probability_per_language = np.hstack(
            [self.probability_per_language, np.array([0.0])])
def build_genre_index(movies, tvshows):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set(
            itertools.chain.from_iterable(
                [entity['genre'] for entity in entities])))
    mapped_entities = {}
    for entity in entities:
        for genre in entity['genre']:
            if genre not in mapped_entities:
                mapped_entities[genre] = []
            mapped_entities[genre].append(entity)
    logger.debug('Iterating genre took {} ms'.format(
        int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram(items=values, key=lambda x: x.lower())
    logger.debug('Building genre index took {} ms'.format(
        int((time.time() - start) * 1000)))
    return index, mapped_entities
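# The genre index above is keyed with key=lambda x: x.lower(): items are
# indexed by their lowercased form, but search() still returns them with
# their original casing. Queries are matched as given, so lowercase them at
# the call site. A usage sketch (genre name invented for illustration):
#
#   index, mapped = build_genre_index(movies, tvshows)
#   index.search('comedy')  # -> e.g. [('Comedy', <similarity>), ...]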
def sonucbul():
    kelimeler = list()
    v = NGram(ngramdatawords)
    sonucthreshold = list()
    sonuckelime = list()
    kelimedizisi = np.zeros((1, len(ngramdatawords)), dtype='int8')
    yorum = e1.get()

    # Strip punctuation and digits, then lowercase
    cevirici = str.maketrans('', '', punctuation)
    yorum = yorum.translate(cevirici)
    cevirici = str.maketrans('', '', digits)
    yorum = yorum.translate(cevirici)
    yorum = yorum.lower()

    kelimeler.clear()
    kelimeler = yorum.split()
    for j in range(0, len(kelimeler), 1):
        sonucthreshold.clear()
        sonuckelime.clear()
        for ngrami in v.search(kelimeler[j], threshold=0.4):
            sonuckelime.append(str(ngrami[0]))
            # similarity is a float in [0, 1]; int() would truncate it to 0
            sonucthreshold.append(float(ngrami[1]))
        if len(sonuckelime) != 0:
            kelimedizisi[0][ngramdatawords.index(
                sonuckelime[sonucthreshold.index(max(sonucthreshold))])] += 1

    tmpdf = pd.DataFrame(kelimedizisi)
    sonuc = ngrammodel.predict(tmpdf)
    cevirici = str.maketrans('', '', punctuation)
    cevap = str(sonuc).translate(cevirici)
    print("Yorum= " + yorum + " Yorum Sonucu= " + str(sonuc))
    e1.delete(0, END)
    Label(master, text="Puan(1-5) =" + str(cevap)).grid(row=2)
def build_title_index(movies, tvshows):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [entity['title'] for entity in entities]
    mapped_entities = {}
    for entity in entities:
        value = entity['title']
        if value not in mapped_entities:
            mapped_entities[value] = []
        mapped_entities[value].append(entity)
    logger.debug('Iterating title took {} ms'.format(
        int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building title index took {} ms'.format(
        int((time.time() - start) * 1000)))
    return index, mapped_entities
def filterByOp(self, clone):
    opStr1 = ""
    opStr2 = ""
    indx1, start1, end1 = clone[1]
    indx2, start2, end2 = clone[2]
    for i in range(start1, end1 + 1):
        opStr1 += str(self.op1_hash.get(i, -1))
    for i in range(start2, end2 + 1):
        opStr2 += str(self.op2_hash.get(i, -1))

    if config.DEBUG is True:
        print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
        print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))

    # if ((self.hasChanged(opStr1) is False) or
    #         (self.hasChanged(opStr2) is False)):
    if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
        return None

    idx = NGram(N=config.NGRAM)
    ngram1 = list(idx.ngrams(opStr1))
    ngram2 = list(idx.ngrams(opStr2))
    metric = self.compareList(ngram1, ngram2)
    return metric
def build_collection_index(movies, tvshows):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set([
            parse_collection(entity['set']) for entity in entities
            if 'set' in entity and len(entity['set']) > 0
        ]))
    mapped_entities = {}
    for entity in entities:
        if 'set' in entity and entity['set']:
            value = parse_collection(entity['set'])
            if value not in mapped_entities:
                mapped_entities[value] = []
            mapped_entities[value].append(entity)
    logger.debug('Iterating collection took {} ms'.format(
        int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building collection index took {} ms'.format(
        int((time.time() - start) * 1000)))
    return index, mapped_entities
def update_dict(self, word):
    if word.lower() not in self.stop_words:
        if word in self.word_dict:
            self.word_dict[word].increment_freq()
        else:
            self.word_dict[word] = NGram(1, word, 1)
def update_dict(self, phrase):
    # if word.lower() not in self.stop_words:
    phrase_str = " ".join(phrase)
    if phrase_str in self.word_dict:
        self.word_dict[phrase_str].increment_freq()
    else:
        self.word_dict[phrase_str] = NGram(int(self.argv[3]), phrase_str, 1)
def test_set_operations(self):
    """Test advanced set operations"""
    items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
    items2 = set(["cdefg", "lmnop"])
    idx1 = NGram(items1)
    idx2 = NGram(items2)
    results = lambda L: sorted(x[0] for x in L)
    # Item removal
    self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
    idx1.remove('abcde')
    self.assertEqual(results(idx1.search('cde')), ["cdefg"])
    # Set intersection operation
    items1.remove('abcde')
    idx1.intersection_update(idx2)
    self.assertEqual(idx1, items1.intersection(items2))
    self.assertEqual(results(idx1.search('lmn')), [])
    self.assertEqual(results(idx1.search('ijk')), [])
    self.assertEqual(results(idx1.search('def')), ['cdefg'])
def update_dict(self, word):
    if word in self.attr_dict:
        # if the word in the article has a given attribute
        if self.attribute in self.attr_dict[word]:
            self.attr_count += 1
        if word in self.word_dict:
            self.word_dict[word].increment_freq()
        else:
            self.word_dict[word] = NGram(1, word, 1)
def test():
    filter = opFilter()
    opStr1 = "nnn+"
    opStr2 = "nn+"
    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))
    print(filter.compareList(l1, l2))
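# For reference, NGram.ngrams() yields the raw sliding-window n-grams of a
# string without padding, so with N=3 (assuming config.NGRAM == 3 here):
#
#   list(NGram(N=3).ngrams("nnn+"))  # -> ['nnn', 'nn+']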
def __init__(self, text):
    self.ng = []
    self.topic = []
    with open(text, "r") as file:
        for linea in file.readlines():
            words = linea.split(" ")
            self.topic += [words[0]]
            nng = NGram(words[2].split(","))
            self.ng += [nng]
def update_dict(self, phrase):
    phrase_str = " ".join(phrase)
    if phrase_str in self.word_dict:
        self.word_dict[phrase_str].increment_freq()
    else:
        self.word_dict[phrase_str] = NGram(
            len(phrase),
            phrase_str,
            1,
            self.get_sentiment_val(phrase),
            self.get_topic_ref_count(phrase_str))
def ngram_similarity(data, col1, col2):
    cos = []
    for i in range(len(data.id)):
        st = data[col1][i]
        title = data[col2][i]
        n = NGram(title.split(), key=lambda x: x[1])
        for s in st.split():
            n.search(s)  # note: result is discarded; only the TF-IDF cosine below is used
        tfidf = sktf.TfidfVectorizer().fit_transform([st, title])
        c = ((tfidf * tfidf.T).A)[0, 1]
        cos.append(c)
    return cos
def update_dict(self, phrase):
    # if word.lower() not in self.stop_words:
    phrase_str = " ".join(phrase)
    if phrase_str in self.word_dict:
        self.word_dict[phrase_str].increment_freq()
    else:
        self.word_dict[phrase_str] = NGram(
            int(self.argv[3]),
            phrase_str,
            1,
            self.get_sentiment_val(phrase),
            0,  # don't care about topic_ref
            self.get_welcome_val(phrase))
def __init__(self, model_dir=None, conf_file=None):
    if model_dir is None:
        model_dir = self.MODEL_DIR
    if not path.isdir(model_dir):
        raise ValueError('Directory does not exist: %s' % (model_dir))
    if conf_file is None:
        conf_file = self.CONF_FILE
    conf_file = path.abspath(path.join(model_dir, conf_file))
    if not path.isfile(conf_file):
        raise ValueError('File does not exist: %s' % (conf_file))
    self._load_config(conf_file)
    self.ngram = NGram(model_dir)
def test_init_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 10,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    print("ngram.counts:")
    print(dict(ngram.counts))
    self.assertEqual(dict(ngram.counts), counts)
def test_count_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(ngram.count(gram), c)
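# Note on the two tests above: the raw counts dict sees 10 unigram events
# (the tokens of the two training sentences), while count() also includes
# one sentence-end marker '</s>' per sentence, which raises the
# empty-context total from 10 to 12.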
def __init__(self, n, loaded=False):
    """
    Constructor

    Parameters
    ----------
    n : int
        ngram size
    """
    self.ngram_gen = NGram(N=n)
    self.size = n
    self.ngram_index = {"": 0}
    self.index_ngram = {0: ""}
    self.cpt = 0
    self.max_len = 0
    self.loaded = loaded
def ngram(w1, w2, n):
    """ngram distance"""
    pad = lambda x: "#{}#".format(x)
    w1, w2 = pad(w1), pad(w2)
    g1 = [w1[i:i + n] for i in range(len(w1) - n + 1)]
    g2 = [w2[i:i + n] for i in range(len(w2) - n + 1)]
    # compute ngram distance:
    # d(a, b) = |a| + |b| - 2|a intersection b|
    ng = NGram(g1)
    ng.intersection_update(g2)
    d = len(g1) + len(g2) - 2 * len(list(ng))
    return d
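# Worked example for the distance above: with n=2,
# "#hello#" -> ['#h', 'he', 'el', 'll', 'lo', 'o#'] and
# "#hallo#" -> ['#h', 'ha', 'al', 'll', 'lo', 'o#'];
# the two sets share 4 bigrams, so d = 6 + 6 - 2*4 = 4.
# Since NGram is a set, repeated bigrams within a word count once
# in the intersection.
#
#   ngram("hello", "hallo", 2)  # -> 4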
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the rewrite
    still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'),
                     [('askfjwehiuasdfji', 1.0),
                      ('asdfawe', 0.17391304347826086),
                      ('asfwef', 0.083333333333333329),
                      ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
def get_ngram_similarity(gold, candidates, N=3, strip_space=True):
    def _strip_space(s):
        if not strip_space:
            return s
        return "\n".join([part.strip(" ") for part in s.split("\n")])

    ng = NGram([_strip_space(gold)], N=N)
    sims = []
    for c in candidates:
        ng_out = ng.search(_strip_space(c))
        if len(ng_out) == 0:
            sims.append(0.0)
        else:
            sims.append(ng_out[0][1])
    return sims
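# NGram.search() returns matches sorted by decreasing similarity (see the
# expectations in test_ngram_search above), so ng_out[0][1] is the score of
# the single gold string whenever it matches at all. A usage sketch
# (candidate strings invented for illustration):
#
#   get_ngram_similarity("foo bar", ["foo bar", "foo baz", "qux"])
#   # -> [1.0, <score in (0, 1)>, 0.0]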
def __init__(self, lang, uri=None, ngram=False):
    self.client = pymongo.MongoClient(uri)
    self.uri = uri
    self.lang = lang
    self.db = self.client['nlp_{}'.format(self.lang)]
    self.fs = gridfs.GridFS(self.db)
    self.Collect = self.db['pmi']
    self.cpus = math.ceil(mp.cpu_count() * 0.2)
    self.frequency = {}
    if ngram:
        # use ngram for searching
        self.pmiNgram = NGram((i['key'] for i in self.db['pmi'].find({}, {
            'key': 1,
            '_id': False
        })))