def filterByOp(self,clone):
        opStr1 = ""
        opStr2 = ""
        indx1,start1,end1 = clone[1]
        indx2,start2,end2 = clone[2]

        for i in range(start1,end1+1):
            opStr1 += str(self.op1_hash.get(i,-1))
        for i in range(start2,end2+1):
            opStr2 += str(self.op2_hash.get(i,-1))

        if config.DEBUG:
            print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
            print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))

        # keep the clone pair only if both op sequences have changed
        if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
            return None

        idx = NGram(N=config.NGRAM)
        ngram1 = list(idx.ngrams(opStr1))
        ngram2 = list(idx.ngrams(opStr2))
        metric = self.compareList(ngram1,ngram2)

        return metric
Example No. 2
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = [a for a in n.search( article.title ) if a[1] >= 0.4]
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare( article.title, nearest.title ) < 0.7:
                    results.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", { "article_list": results } )
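A minimal, self-contained sketch of the same dedup pattern, using only the plain python-ngram API (no Django models, no custom iconv); the Doc class and the 0.4 cutoff here are illustrative assumptions:

from ngram import NGram

class Doc:
    def __init__(self, title):
        self.title = title
        self.is_duplicate = False
        self.duplicate_of = None

index = NGram(key=lambda d: d.title)
unique = []
for doc in [Doc("Fed raises rates"), Doc("Fed raises rates again"), Doc("Team wins cup")]:
    matches = [m for m in index.search(doc.title) if m[1] >= 0.4]
    if matches:
        doc.is_duplicate = True
        doc.duplicate_of = matches[0][0]   # best-scoring earlier doc
    else:
        unique.append(doc)
    index.add(doc)
print([d.title for d in unique])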
Example No. 3
def build_multiclusters(inlines, threshold=0.05, N=4):
	clusters = []
	ignoreus = []

	for i, iline in enumerate(inlines):
		if i in ignoreus:
			continue

		iString = " ".join(iline.split(" :::: ")[:3])

		ignoreus.append(i)

		icluster = {}
		icluster[iline] = -1
		iModel = NGram(iString)

		for j in range(i, len(inlines)):
			if j in ignoreus:
				continue
		
			jline = inlines[j]
			jString = " ".join(jline.split(" :::: ")[:3])
		
			results = iModel.search(jString)
			score = sum([y for x,y in results]) / len(results) \
					if len(results) > 0 else 0.0
			print(score)

			if score > threshold:
				icluster[jline] = score
				iModel.add(jString)
				ignoreus.append(j)

		clusters.append(icluster)
	return clusters
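A hypothetical invocation of build_multiclusters on records in the " :::: "-delimited layout it expects; only the first three fields take part in the similarity score:

lines = [
    "alpha beta :::: gamma :::: delta :::: payload-1",
    "alpha beta :::: gamma :::: delta :::: payload-2",
    "completely :::: different :::: record :::: payload-3",
]
for cluster in build_multiclusters(lines, threshold=0.05, N=4):
    print(list(cluster.keys()))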
Example No. 4
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example No. 5
	def map(self,phrase):
		for term in phrase:
			if len(term) > 4:
				continue
			for word in self.corpus:
				z = set(term) & set(word)

				if len(z) > 0 and len(z) < len(term):
					# partial character overlap with the corpus word
					g = NGram(z - set(term))
				else:
					# At this point we assume context is not informative;
					# in that case we resort to a fuzzy lookup over the word
					g = NGram(word)
				g.remove(term)
				matches = g.search(term)
				key = None
				value = None					
				if len(matches) > 0:
					matches = list(matches[0])
					Pz_ = len(matches) / self.size
					Px_ = fuzz.ratio(term,matches[0]) / 100
					if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
						key = term
						value= {}
						value= [matches[0],Pz_,Px_,1]
						self.emit (key,value)
Example No. 6
def select_translation(sentence, idx, word, translations):
    # make sure the subject pronoun is in subject form
    # heuristic: if it's the first word or the previous word is punctuation
    # or conjunction, it's considered a subject
    if word[1] == 'r' and word[0] in subject_pronoun:
        if idx == 0 or sentence[idx-1][1] in ['x', 'c']:
            return (subject_pronoun[word[0]], 'pron')

    # handle special case: <digits>/m 日/m
    if word[1] == 'm':
        if DIGITS_PATTERN.match(word[0]):
            if idx+1 < len(sentence) and sentence[idx+1][0] == u'日':
                # return proper date string
                return (translate_date(int(word[0])), 'n')
            else:
                # return digits directly
                return (word[0], 'n')
        elif word[0] == u'日':
            # symmetric case
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx-1][0]):
                return ('', '')

    # construct a list of translations with the same pos as word
    same_pos_translations = [t for t in translations if match_pos(word[1], t[1])]

    ng = NGram()

    if len(same_pos_translations) > 0:
        max_unigram_trans = max(same_pos_translations, key=lambda t: ng.get(t[0]))
        return max_unigram_trans

    return translations[0]
Example No. 7
def test():
    op_filter = opFilter()

    opStr1 = "nnn+"
    opStr2 = "nn+"

    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))

    print(op_filter.compareList(l1, l2))
Example No. 8
def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub('  ', ' ', str(row))
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
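A hypothetical call, assuming static/UniqueFBUnivNames.csv exists and holds one university name per row:

result = ngram_similarity("stanford university")
print(result.get('univ'), result.get('score_used'))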
Example No. 9
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> _ = open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')

    >>> _ = open('right.csv', 'w').write('''ID,NAME
    ... ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ... right_path='right.csv', right_column=1, outfile='out.csv',
    ... titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example No. 10
	def verify(self,text_compare):
		results = []
		dictio = []
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':	
			if linea2 != '\n':
				dictio += [self.ng.items_sharing_ngrams(linea2)]
				compares = 0.0
				for parrafo in self.lsn:
					comp = NGram.compare(parrafo,linea2)
					if compares < comp:
						compares = comp
				results += [compares]
			linea2 = file2.readline()
		file2.close()

		major_ocurrences=[]
		for d in dictio:
			major=0
			for val in d.values():
				if major<val:
					major=val
			major_ocurrences+=[major]
			

		avg_perc=0.0
		for r in results:
			avg_perc+=r
		avg_perc=avg_perc/len(results)

		print("Mayor numero de ocurrencias por parrafo del texto copia: "+repr(major_ocurrences))
		print("Porcentaje Similitud: "+repr(avg_perc))
Example No. 11
def get_distr(strlist, n_len):
    alphabet = ['A', 'C', 'G', 'T', 'N']
    n = NGram(N=n_len, pad_len=0)
    all_ngrams = 0
    grams = init_grams_dict(n_len, alphabet)
    for item in strlist:
        if item == '':
            continue
        ngram_list = list(n._split(item))
        for ng in ngram_list:
            if ng in grams:
                grams[ng] += 1.0
                all_ngrams += 1
    for item in grams.keys():
        grams[item] /= all_ngrams
    return grams
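get_distr depends on an external init_grams_dict; a plausible stand-in (an assumption, not taken from the source) gives every possible n-gram over the alphabet a zero count:

from itertools import product

def init_grams_dict(n_len, alphabet=('A', 'C', 'G', 'T', 'N')):
    # hypothetical helper: all length-n strings over the alphabet, count 0.0
    return {''.join(p): 0.0 for p in product(alphabet, repeat=n_len)}

dist = get_distr(['ACGTACGT', 'GGTTAACC'], 2)
print(round(sum(dist.values()), 6))  # 1.0 once the counts are normalised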
Example No. 12
def main():
    questions_path, answers_path = sys.argv[1:]

    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)

    print('\nTraining on Corpus')
    model = NGram.train_model(train_sentences, disp=True)

    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True,
                                token_start_end=('<s>', '</s>'))

    dev_sentences = answers[:520]

    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)

    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True,
                                  token_start_end=('<s>', '</s>'))

    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)

    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            out_file.write('{}\t{}\n'.format(' '.join(questions[i]).replace('<s0> <s1>', '<s>'), perplexity))
Example No. 13
def compare_ngrams(left, right, N=2, pad_len=0):
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for 1 letter comparison, even if letters
        # are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
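A quick check of the wrapper's edge case (this assumes ascii() is the surrounding module's transliteration helper rather than the builtin):

print(compare_ngrams('a', 'a'))                 # 1.0, special-cased above
print(NGram.compare('a', 'a', N=2, pad_len=0))  # 0.0, no bigrams to share
print(compare_ngrams('paris', 'pariss'))        # high similarity, below 1.0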
Example No. 14
    def test_count_1gram(self):
        ngram = NGram(1, self.sents)

        counts = {
            (): 12,
            ('el',): 1,
            ('gato',): 1,
            ('come',): 2,
            ('pescado',): 1,
            ('.',): 2,
            ('</s>',): 2,
            ('la',): 1,
            ('gata',): 1,
            ('salmón',): 1,
        }
        for gram, c in counts.items():
            self.assertEqual(ngram.count(gram), c)
Example No. 15
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'), [
            ('askfjwehiuasdfji', 1.0),
            ('asdfawe', 0.17391304347826086),
            ('asfwef', 0.083333333333333329),
            ('adfwe', 0.041666666666666664)])
        self.assertEqual(idx.search('afadfwe')[:2],
                [('adfwe', 0.59999999999999998),
                 ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Example No. 16
def backoff_score_strings(iline, jline, N, T=0.0):
	iString = " ".join(iline.split(" :::: ")[:3])
	jString = " ".join(jline.split(" :::: ")[:3])

	score = -1
	while score <= T and N >= 1:
		score = NGram.compare(iString, jString, N=N)
		N = N - 1

	return score
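Hypothetical records in the " :::: " layout these helpers assume; the backoff retries with ever smaller N until the score clears the threshold T:

a = "John Smith :::: 1980 :::: New York :::: extra"
b = "Jon Smith :::: 1980 :::: New York :::: other"
print(backoff_score_strings(a, b, N=4, T=0.2))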
Example No. 17
def cumulative_score_strings(iline, jline, N):
	iString = " ".join(iline.split(" :::: ")[:3])
	jString = " ".join(jline.split(" :::: ")[:3])

	score = 0
	while N >= 1:
		score += (NGram.compare(iString, jString, N=N)) #* N)
		N = N - 1

	return score
Example No. 18
    def wordsoccurrences(self, words_list, option='ortony'):
        frequencies = FreqDist(words_list)
        ordered_unigrams = frequencies.most_common()
        if option == 'ortony':
            lexicon = self.ortony_list
        else:
            lexicon = self.profane_words
        count = 0
        for t_word, count_w in ordered_unigrams:
            lower_word = t_word.lower()
            three_grams = NGram(lexicon)
            likely_words = three_grams.search(lower_word)
            if len(likely_words) > 0:
                # if lower_word in lexicon:
                count += 1 * count_w

            if lower_word in lexicon:
                count += 1
        return count
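The core lookup this method performs, sketched with a toy lexicon (illustrative words, not the actual Ortony list):

lexicon = ['stupid', 'idiot', 'dumb']
three_grams = NGram(lexicon)
print(three_grams.search('stupd'))  # near matches score above 0 and get counted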
Example No. 19
	def __init__(self,text):
		self.ng=NGram()
		file = open(text,"r")
		linea = file.readline()
		while linea != '':
			if linea != '\n':
				self.ng.add(linea)
			linea = file.readline()
		self.lsn = list(self.ng)
		file.close()
Example No. 20
	def verify(self,text_compare):
		results = []
		texto = []
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':
			texto+=linea2.split(" ")
			linea2 = file2.readline()
		tng=NGram(texto)
		file2.close()

		for ngs in self.ng:
			count=0
			for word in list(ngs):
				for porc in tng.search(word):
					if porc[1]>0.3:
						count+=1
			results+=[count]

		print(list(results))

		pos=0
		count=0
		i=0
		for res in results:
			if count<res:
				count=res
				pos=i
			i+=1

		if results[pos]>2:
			print("Most likely topic of the text: "+repr(self.topic[pos]))
		else:
			print("Could not determine what the text is about")
		print("")
Example No. 21
    def test_set_operations(self):
        """Test advanced set operations"""
        items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
        items2 = set(["cdefg", "lmnop"])
        idx1 = NGram(items1)
        idx2 = NGram(items2)
        results = lambda L: sorted(x[0] for x in L)
        # Item removal
        self.assertEqual(results(idx1.search('cde')), ["abcde","cdefg"])
        idx1.remove('abcde')
        self.assertEqual(results(idx1.search('cde')), ["cdefg"])
        # Set intersection operation
        items1.remove('abcde')
        idx1.intersection_update(idx2)
        self.assertEqual(idx1, items1.intersection(items2))
        self.assertEqual(results(idx1.search('lmn')), [])
        self.assertEqual(results(idx1.search('ijk')), [])
        self.assertEqual(results(idx1.search('def')), ['cdefg'])
Example No. 22
    def __init__(self, model_dir=None, conf_file=None):
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        self._load_config(conf_file)
        self.ngram = NGram(model_dir)
Example No. 23
def guess_image(name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity
    '''
    name = tokenize(name)
    best = '404'
    best_score = None
    for guess_image, names in IMAGES.items():
        for guess in names:
            score = NGram.compare(guess, name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
    app.logger.info('Pick image %s for name "%s"' % (best, name))
    return best
Example No. 24
def guess_meme_image(meme_name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity
    '''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.info('New best meme for "%s": "%s" (Score: %s)', meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s"' % (best, meme_name))
    return best
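The heart of the heuristic with the Flask logging stripped away; the alias table here is hypothetical:

aliases = {'success kid': 'success-kid.jpg', 'bad luck brian': 'bad-luck-brian.jpg'}
query = 'bad luck brain'
best = max(aliases, key=lambda alias: NGram.compare(alias, query))
print(aliases[best])  # bad-luck-brian.jpg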
Example No. 25
def smart_read(url):
    resp = urllib.request.urlopen(url)
    # resolve the final url after redirects
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path

    html = resp.read().decode('utf-8', errors='replace')
    tree = etree.parse(io.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    nmax = 0
    mirror = None
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path, path)
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(io.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print(d)
Example No. 26
def process(hr,sr,he,se):

    categories_relevant = {}
    categories_extracted = {}

    category_idx_list = []
    for i,h in enumerate(hr):
        for j,h1 in enumerate(he):
            if NGram.compare(hr[i], he[j]) > 0.95:
                category_idx_list.append((i,j))

    if he:
        if len(he) != len(se):
            return 0 , 0
    for i,C in enumerate(category_idx_list):
        categories_relevant[i] = sr[C[0]]
        tmp = se[C[1]].replace('\r', '').replace('\n','')
        categories_extracted[i] = tmp

    e = Evaluator(categories_relevant, categories_extracted)
    p, r = e.evaluate_using_ngrams(3)

    return p, r
Example No. 27
	def breakCipher(self,enc,dictionary):
		ngrams = NGram()

		uniEnc = ngrams.unigrams(enc)
		biEnc = ngrams.bigrams(enc)
		triEnc = ngrams.trigrams(enc)

		uniDictionary = ngrams.unigrams(dictionary)
		biDictionary = ngrams.bigrams(dictionary)
		triDictionary = ngrams.trigrams(dictionary)

		sort_tg_enc = sorted(uniEnc,key=uniEnc.get,reverse=True)
		sort_tg_dic = sorted(uniDictionary,key=uniDictionary.get,reverse=True)

		# map the most frequent cipher symbols to the most frequent plaintext symbols
		key = {}
		for index,x in enumerate(sort_tg_enc):
			key[x] = sort_tg_dic[index]


		h = SimulatedAnnealing()
		h.breakOpen(enc,key,uniDictionary,triDictionary)
Example No. 28
class NgramIndex():
    """
    Class used for encoding words in ngram representation
    """
    def __init__(self,n,loaded = False):
        """
        Constructor
        
        Parameters
        ----------
        n : int
            ngram size
        """
        self.ngram_gen = NGram(N=n)

        self.size = n
        self.ngram_index = {"":0}
        self.index_ngram = {0:""}
        self.cpt = 0
        self.max_len = 0

        self.loaded = loaded
    def split_and_add(self,word):
        """
        Split word in multiple ngram and add each one of them to the index
        
        Parameters
        ----------
        word : str
            a word
        """
        ngrams = word.lower().replace(" ","$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ngram in ngrams:
            self.add(ngram)
        self.max_len = max(self.max_len,len(ngrams))

    def add(self,ngram):
        """
        Add a ngram to the index
        
        Parameters
        ----------
        ngram : str
            ngram
        """
        if ngram not in self.ngram_index:
            self.cpt+=1
            self.ngram_index[ngram]=self.cpt
            self.index_ngram[self.cpt]=ngram
        

    def encode(self,word):
        """
        Return a ngram representation of a word
        
        Parameters
        ----------
        word : str
            a word
        
        Returns
        -------
        list of int
            list of ngram index
        """
        ngrams = word.lower().replace(" ","$")
        ngrams = list(self.ngram_gen.split(ngrams))
        return [self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index]

    def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
        """
        Complete a ngram encoded version of word with void ngram. It's necessary for neural network.
        
        Parameters
        ----------
        ngram_encoding : list of int
            first encoding of a word
        MAX_LEN : int
            desired length of the encoding
        filling_item : int, optional
            ngram index you wish to use, by default 0
        
        Returns
        -------
        list of int
            list of ngram index
        """
        if self.loaded and len(ngram_encoding) >=MAX_LEN:
            return ngram_encoding[:MAX_LEN]
        assert len(ngram_encoding) <= MAX_LEN
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item]*diff)  
        return ngram_encoding

    def save(self,fn):
        """

        Save the NgramIndex
        
        Parameters
        ----------
        fn : str
            output filename
        """
        data = {
            "ngram_size": self.size,
            "ngram_index": self.ngram_index,
            "cpt_state": self.cpt,
            "max_len_state": self.max_len
        }
        json.dump(data,open(fn,'w'))

    @staticmethod
    def load(fn):
        """
        
        Load a NgramIndex state from a file.
        
        Parameters
        ----------
        fn : str
            input filename
        
        Returns
        -------
        NgramIndex
            ngram index
        
        Raises
        ------
        KeyError
            raised if a required field does not appear in the input file
        """
        try:
            data = json.load(open(fn))
        except json.JSONDecodeError:
            raise ValueError("Data file must be a JSON")
        for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
            if key not in data:
                raise KeyError("{0} field cannot be found in given file".format(key))
        new_obj = NgramIndex(data["ngram_size"],loaded=True)
        new_obj.ngram_index = data["ngram_index"]
        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
        new_obj.cpt = data["cpt_state"]
        new_obj.max_len = data["max_len_state"]
        return new_obj
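A hypothetical round trip with the class above:

idx = NgramIndex(4)
idx.split_and_add("new york")
enc = idx.encode("new york")
print(enc)                                  # list of ngram ids
print(idx.complete(list(enc), MAX_LEN=12))  # padded with the void ngram, id 0
idx.save('ngram_index.json')
restored = NgramIndex.load('ngram_index.json')
assert restored.encode("new york") == enc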
Example No. 29
    def sim( a, b ):
        return 1 - NGram.compare( a.title, b.title, warp=WARP, iconv=enrich )
Example No. 30
def get_ngrams(term, n=3):
    term = re.sub(r'[-_./\\ ]', '', term.lower())
    return list(NGram(N=n)._split(term))
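Example use of the helper; note that _split is a private python-ngram method, so the exact padding behaviour depends on the library version:

print(get_ngrams('re-use_case', n=3))  # letter trigrams of 'reusecase'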
        
Example No. 31
    def print_dict(self):
        sorted_values = sorted(self.word_dict.values(),
                               key=lambda ngram: ngram.frequency)
        for value in sorted_values:
            NGram.print_attrib(value)
Example No. 32
    def test_prob(self):
        ngram = NGram(2, self.sents)

        self.assertEqual(ngram.prob('pescado', ['come']), 0.5)
        self.assertEqual(ngram.prob('salmón', ['come']), 0.5)
Example No. 33
    def test_prob_1gram(self):
        ngram = NGram(1, self.sents)

        self.assertEqual(ngram.prob('pescado'), 0.1)
        self.assertEqual(ngram.prob('come'), 0.2)