def filterByOp(self, clone):
    opStr1 = ""
    opStr2 = ""
    indx1, start1, end1 = clone[1]
    indx2, start2, end2 = clone[2]
    for i in range(start1, end1 + 1):
        opStr1 += str(self.op1_hash.get(i, -1))
    for i in range(start2, end2 + 1):
        opStr2 += str(self.op2_hash.get(i, -1))

    if config.DEBUG is True:
        print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
        print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))

    # Both op sequences must have changed for the clone pair to be kept.
    if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
        return None

    idx = NGram(N=config.NGRAM)
    ngram1 = list(idx.ngrams(opStr1))
    ngram2 = list(idx.ngrams(opStr2))
    return self.compareList(ngram1, ngram2)
def simtitle(request):
    """Calculate similarity based on title and a naive threshold."""
    n = NGram(warp=WARP, iconv=enrich, key=lambda x: x.title)
    articles = Article.objects.filter(status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            # Too dissimilar on direct comparison: treat as a fresh article.
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            # No candidate cleared the threshold: the article is unique.
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={"article_list": results})
def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {}
        icluster[iline] = -1
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum([y for x, y in results]) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
def map(self, phrase):
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = set(term) & set(word)
            # If term and word share some but not all characters, the original
            # draft considered searching NGram(z - set(term)); that branch was
            # left disabled. Context is treated as uninformative, so we resort
            # to a fuzzy lookup instead.
            g = NGram(word)
            if term in g:
                g.remove(term)
            matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = [matches[0], Pz_, Px_, 1]
            self.emit(key, value)
def select_translation(sentence, idx, word, translations):
    # Make sure the subject pronoun is in subject form.
    # Heuristic: if it's the first word, or the previous word is punctuation
    # or a conjunction, it's considered a subject.
    if word[1] == 'r' and word[0] in subject_pronoun:
        if idx == 0 or sentence[idx - 1][1] in ['x', 'c']:
            return (subject_pronoun[word[0]], 'pron')
    # Handle the special case: <digits>/m 日/m
    if word[1] == 'm':
        if DIGITS_PATTERN.match(word[0]):
            if idx + 1 < len(sentence) and sentence[idx + 1][0] == u'日':
                # Return a proper date string.
                return (translate_date(int(word[0])), 'n')
            else:
                # Return the digits directly.
                return (word[0], 'n')
        elif word[0] == u'日':  # symmetric case
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx - 1][0]):
                return ('', '')
    # Construct a list of translations with the same POS as the word.
    same_pos_translations = [t for t in translations if match_pos(word[1], t[1])]
    ng = NGram()
    if len(same_pos_translations) > 0:
        # Pick the candidate with the highest unigram probability.
        return max(same_pos_translations, key=lambda t: ng.get(t[0]))
    return translations[0]
def test():
    op_filter = opFilter()  # renamed from `filter` to avoid shadowing the builtin
    opStr1 = "nnn+"
    opStr2 = "nn+"
    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))
    print(op_filter.compareList(l1, l2))
def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', str(row))  # collapse runs of spaces
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> _ = open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')
    >>> _ = open('right.csv', 'w').write('''ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
def verify(self, text_compare):
    results = []
    dictio = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        if linea2 != '\n':
            dictio += [self.ng.items_sharing_ngrams(linea2)]
            compares = 0.0
            for parrafo in self.lsn:
                comp = NGram.compare(parrafo, linea2)
                if compares < comp:
                    compares = comp
            results += [compares]
        linea2 = file2.readline()
    file2.close()
    major_ocurrences = []
    for d in dictio:
        major = 0
        for val in d.values():
            if major < val:
                major = val
        major_ocurrences += [major]
    avg_perc = 0.0
    for r in results:
        avg_perc += r
    avg_perc = avg_perc / len(results)
    print("Highest number of occurrences per paragraph of the copied text: " + repr(major_ocurrences))
    print("Similarity percentage: " + repr(avg_perc))
def get_distr(strlist, n_len):
    alphabet = ['A', 'C', 'G', 'T', 'N']
    n = NGram(N=n_len, pad_len=0)
    all_ngrams = 0
    grams = init_grams_dict(n_len, alphabet)
    for item in strlist:
        if item == '':
            continue
        ngram_list = list(n._split(item))
        for ng in ngram_list:
            if ng in grams:
                grams[ng] += 1.0
                all_ngrams += 1
    for item in grams.keys():
        grams[item] /= all_ngrams
    return grams
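# A minimal usage sketch for get_distr above. The helper init_grams_dict is
# not shown in the original snippet; the version below is an assumed
# implementation that zero-fills one entry per possible n-gram.
from itertools import product

def init_grams_dict(n_len, alphabet):
    # Assumed helper: dict with one key per n-gram over the alphabet.
    return {''.join(p): 0.0 for p in product(alphabet, repeat=n_len)}

reads = ["ACGTACGT", "ACGTTTAC"]
distr = get_distr(reads, n_len=2)
print(distr["AC"])  # relative frequency of the bigram "AC" across the reads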
def main():
    questions_path, answers_path = sys.argv[1:]
    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)
    print('\nTraining on Corpus')
    model = NGram.train_model(train_sentences, disp=True)
    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True, token_start_end=('<s>', '</s>'))
    dev_sentences = answers[:520]
    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)
    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True, token_start_end=('<s>', '</s>'))
    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)
    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            out_file.write('{}\t{}\n'.format(
                ' '.join(questions[i]).replace('<s0> <s1>', '<s>'), perplexity))
def compare_ngrams(left, right, N=2, pad_len=0):
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for a 1-letter comparison, even if the
        # letters are equal, so handle that case directly.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
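# A small, illustrative check of compare_ngrams (it assumes `ascii` above is
# a plain string normalizer, effectively identity on these inputs).
assert compare_ngrams('a', 'a') == 1.0   # single letters short-circuit to 1.0
assert compare_ngrams('a', 'b') == 0.0
print(compare_ngrams('color', 'colour'))  # bigram similarity, strictly between 0 and 1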
def test_count_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(ngram.count(gram), c)
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the rewrite still
    uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
def backoff_score_strings(iline, jline, N, T=0.0):
    """Back off from N-grams to shorter grams until the score clears T."""
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = -1
    while score <= T and N >= 1:
        score = NGram.compare(iString, jString, N=N)
        N = N - 1
    return score
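# Hedged example of the backoff above; the " :::: "-separated lines are made
# up, and the point is that a near-match scores at some N >= 1.
line_a = "alpha beta :::: gamma :::: delta :::: extra"
line_b = "alpha beta :::: gamma :::: delt :::: extra"
print(backoff_score_strings(line_a, line_b, N=4))  # first score above T while backing off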
def cumulative_score_strings(iline, jline, N):
    """Sum NGram.compare scores over all gram sizes from N down to 1."""
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = 0
    while N >= 1:
        score += NGram.compare(iString, jString, N=N)  # optionally weight by N
        N = N - 1
    return score
def wordsoccurrences(self, words_list, option='ortony'):
    frequencies = FreqDist(words_list)
    ordered_unigrams = frequencies.most_common()
    if option == 'ortony':
        lexicon = self.ortony_list
    else:
        lexicon = self.profane_words
    # Build the ngram index once, rather than once per word.
    three_grams = NGram(lexicon)
    count = 0
    for t_word, count_w in ordered_unigrams:
        lower_word = t_word.lower()
        likely_words = three_grams.search(lower_word)
        if len(likely_words) > 0:
            count += 1 * count_w
        if lower_word in lexicon:
            count += 1
    return count
def __init__(self, text):
    self.ng = NGram()
    file = open(text, "r")
    linea = file.readline()
    while linea != '':
        if linea != '\n':
            self.ng.add(linea)
        linea = file.readline()
    self.lsn = list(self.ng)
    file.close()
def verify(self, text_compare):
    results = []
    texto = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        texto += linea2.split(" ")
        linea2 = file2.readline()
    tng = NGram(texto)
    file2.close()
    for ngs in self.ng:
        count = 0
        for word in list(ngs):
            for porc in tng.search(word):
                if porc[1] > 0.3:
                    count += 1
        results += [count]
    print(list(results))
    pos = 0
    count = 0
    i = 0
    for res in results:
        if count < res:
            count = res
            pos = i
        i += 1
    if results[pos] > 2:
        print("Most precise topic of the text: " + repr(self.topic[pos]))
    else:
        print("Could not determine what the text is about")
    print("")
def test_set_operations(self):
    """Test advanced set operations"""
    items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
    items2 = set(["cdefg", "lmnop"])
    idx1 = NGram(items1)
    idx2 = NGram(items2)
    results = lambda L: sorted(x[0] for x in L)
    # Item removal
    self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
    idx1.remove('abcde')
    self.assertEqual(results(idx1.search('cde')), ["cdefg"])
    # Set intersection operation
    items1.remove('abcde')
    idx1.intersection_update(idx2)
    self.assertEqual(idx1, items1.intersection(items2))
    self.assertEqual(results(idx1.search('lmn')), [])
    self.assertEqual(results(idx1.search('ijk')), [])
    self.assertEqual(results(idx1.search('def')), ['cdefg'])
def __init__(self, model_dir=None, conf_file=None):
    if model_dir is None:
        model_dir = self.MODEL_DIR
    if not path.isdir(model_dir):
        raise ValueError('Directory does not exist: %s' % model_dir)
    if conf_file is None:
        conf_file = self.CONF_FILE
    conf_file = path.abspath(path.join(model_dir, conf_file))
    if not path.isfile(conf_file):
        raise ValueError('File does not exist: %s' % conf_file)
    self._load_config(conf_file)
    self.ngram = NGram(model_dir)
def guess_image(name):
    '''Guess which meme image they mean by finding the alias with the
    greatest ngram similarity'''
    name = tokenize(name)
    best = '404'
    best_score = None
    for guess_image, names in IMAGES.items():
        for guess in names:
            score = NGram.compare(guess, name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
    app.logger.info('Pick image %s for name "%s"' % (best, name))
    return best
def guess_meme_image(meme_name):
    '''Guess which meme image they mean by finding the alias with the
    greatest ngram similarity'''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.info('New best meme for "%s": "%s" (Score: %s)',
                                meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s"' % (best, meme_name))
    return best
def smart_read(url):
    resp = urllib2.urlopen(url)
    # Resolve the final URL after redirects.
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    nmax = 0
    mirror = None
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path, path)
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    if mirror is None:
        return  # no similar same-domain page found
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print(d)
def process(hr, sr, he, se):
    categories_relevant = {}
    categories_extracted = {}
    category_idx_list = []
    for i, h in enumerate(hr):
        for j, h1 in enumerate(he):
            if NGram.compare(hr[i], he[j]) > 0.95:
                category_idx_list.append((i, j))
    if he:
        if len(he) != len(se):
            return 0, 0
    for i, C in enumerate(category_idx_list):
        categories_relevant[i] = sr[C[0]]
        tmp = se[C[1]].replace('\r', '').replace('\n', '')
        categories_extracted[i] = tmp
    e = Evaluator(categories_relevant, categories_extracted)
    p, r = e.evaluate_using_ngrams(3)
    return p, r
def breakCipher(self, enc, dictionary):
    ngrams = NGram()
    uniEnc = ngrams.unigrams(enc)
    biEnc = ngrams.bigrams(enc)
    triEnc = ngrams.trigrams(enc)
    uniDictionary = ngrams.unigrams(dictionary)
    biDictionary = ngrams.bigrams(dictionary)
    triDictionary = ngrams.trigrams(dictionary)
    # Map the most frequent ciphertext symbols onto the most frequent
    # dictionary symbols to build an initial key.
    sort_tg_enc = sorted(uniEnc, key=uniEnc.get, reverse=True)
    sort_tg_dic = sorted(uniDictionary, key=uniDictionary.get, reverse=True)
    key = {}
    for index, x in enumerate(sort_tg_enc):
        key[x] = sort_tg_dic[index]
    h = SimulatedAnnealing()
    h.breakOpen(enc, key, uniDictionary, triDictionary)
class NgramIndex():
    """ Class used for encoding words in ngram representation """

    def __init__(self, n, loaded=False):
        """ Constructor

        Parameters
        ----------
        n : int
            ngram size
        """
        self.ngram_gen = NGram(N=n)
        self.size = n
        self.ngram_index = {"": 0}
        self.index_ngram = {0: ""}
        self.cpt = 0
        self.max_len = 0
        self.loaded = loaded

    def split_and_add(self, word):
        """ Split word into multiple ngrams and add each one of them to the index

        Parameters
        ----------
        word : str
            a word
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ngram in ngrams:
            self.add(ngram)
        self.max_len = max(self.max_len, len(ngrams))

    def add(self, ngram):
        """ Add a ngram to the index

        Parameters
        ----------
        ngram : str
            ngram
        """
        if not ngram in self.ngram_index:
            self.cpt += 1
            self.ngram_index[ngram] = self.cpt
            self.index_ngram[self.cpt] = ngram

    def encode(self, word):
        """ Return a ngram representation of a word

        Parameters
        ----------
        word : str
            a word

        Returns
        -------
        list of int
            list of ngram indexes
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        return [self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index]

    def complete(self, ngram_encoding, MAX_LEN, filling_item=0):
        """ Complete a ngram-encoded version of a word with void ngrams.
        It's necessary for neural networks.

        Parameters
        ----------
        ngram_encoding : list of int
            first encoding of a word
        MAX_LEN : int
            desired length of the encoding
        filling_item : int, optional
            ngram index you wish to use, by default 0

        Returns
        -------
        list of int
            list of ngram indexes
        """
        if self.loaded and len(ngram_encoding) >= MAX_LEN:
            return ngram_encoding[:MAX_LEN]
        assert len(ngram_encoding) <= MAX_LEN
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item] * diff)
        return ngram_encoding

    def save(self, fn):
        """ Save the NgramIndex

        Parameters
        ----------
        fn : str
            output filename
        """
        data = {
            "ngram_size": self.size,
            "ngram_index": self.ngram_index,
            "cpt_state": self.cpt,
            "max_len_state": self.max_len,
        }
        json.dump(data, open(fn, 'w'))

    @staticmethod
    def load(fn):
        """ Load a NgramIndex state from a file.

        Parameters
        ----------
        fn : str
            input filename

        Returns
        -------
        NgramIndex
            ngram index

        Raises
        ------
        KeyError
            raised if a required field does not appear in the input file
        """
        try:
            data = json.load(open(fn))
        except json.JSONDecodeError:
            print("Data file must be a JSON")
            raise
        for key in ["ngram_size", "ngram_index", "cpt_state", "max_len_state"]:
            if not key in data:
                raise KeyError("{0} field cannot be found in given file".format(key))
        new_obj = NgramIndex(data["ngram_size"], loaded=True)
        new_obj.ngram_index = data["ngram_index"]
        new_obj.index_ngram = {v: k for k, v in new_obj.ngram_index.items()}
        new_obj.cpt = data["cpt_state"]
        new_obj.max_len = data["max_len_state"]
        return new_obj
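# A short usage sketch for NgramIndex; the word and file name are
# illustrative, and MAX_LEN=10 is an arbitrary target length.
idx = NgramIndex(3)
idx.split_and_add("new york")
enc = idx.encode("new york")                  # one id per trigram of "new$york"
padded = idx.complete(list(enc), MAX_LEN=10)  # right-pad with the empty-ngram id 0
idx.save("ngram_index.json")
restored = NgramIndex.load("ngram_index.json")
assert restored.encode("new york") == enc     # the restored index encodes identically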
def sim(a, b):
    # Returns a distance (1 - similarity) between the two titles.
    return 1 - NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)
def get_ngrams(term, n=3):
    # Strip separators (hyphen, underscore, dot, slashes, spaces) before splitting.
    term = re.sub(r'[-_./\\ ]', '', term.lower())
    return list(NGram(N=n)._split(term))
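# Illustrative calls for get_ngrams: both spellings reduce to "parseurl", so
# they yield identical trigram lists. (The exact list depends on the NGram
# class's padding default; the distribution snippet above disables padding
# explicitly with pad_len=0.)
assert get_ngrams('parse_url') == get_ngrams('ParseURL')
print(get_ngrams('parse_url'))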
def print_dict(self):
    sorted_values = sorted(self.word_dict.values(),
                           key=lambda ngram: ngram.frequency)
    for value in sorted_values:
        NGram.print_attrib(value)
def test_prob(self):
    ngram = NGram(2, self.sents)
    self.assertEqual(ngram.prob('pescado', ['come']), 0.5)
    self.assertEqual(ngram.prob('salmón', ['come']), 0.5)
def test_prob_1gram(self):
    ngram = NGram(1, self.sents)
    self.assertEqual(ngram.prob('pescado'), 0.1)
    self.assertEqual(ngram.prob('come'), 0.2)