Example #1
def func():
    model = NGram(log=True)
    print(len(model.vocab), len(model.vocab)**2)
    model.train(smoothing='interpolation')
    # alternative: model.train(smoothing='add-k')
    test_sents_from_text = [
        'Нет, лучше сказать',
        'Генерал еще вчера обещал',
        'Воображаю, как Мари удивлялась',
        # 'Они ей рассказали, что',
        # 'князь рассказал всё в подробности',
        'я вас сейчас обниму и поцелую',
        # 'благодаря хорошей погоде уже распустились все деревья',
    ]

    print('from text:')
    for sent in test_sents_from_text:
        print(f'probability of "{sent}": {model.probability(sent)}')

    test_sents_not_from_text = [
        'Машина Тьюринга является расширением',
        'интуитивный алгоритм может быть реализован',
        'процесс пошагового вычисления, в котором',
        'можно вычислить всё, что можно',
    ]
    print('not from text:')
    for sent in test_sents_not_from_text:
        print(f'probability of "{sent}": {model.probability(sent)}')
Example #2
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare(article.title, nearest.title) < 0.7:
                    results.append(article)
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", {
        "article_list": results,
    })
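A minimal sketch of the pairwise check used above, assuming the ngram PyPI package; the 0.7 cut-off mirrors the view's duplicate threshold (titles are hypothetical):

from ngram import NGram

# Similarity of two titles on a 0.0-1.0 scale.
a = "Markets rally after rate decision"
b = "Markets rally following rate decision"
print(NGram.compare(a, b))
# A score >= 0.7 would mark b as a duplicate of a in the view above.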
Example #3
def build_cast_index(movies, tvshows, key):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [[cast[key] for cast in entity['cast']] for entity in entities]
    values = list(set(itertools.chain.from_iterable(values)))

    mapped_entities = {}
    for entity in entities:
        for cast in entity['cast']:
            value = cast[key]
            if value not in mapped_entities:
                mapped_entities[value] = []

            mapped_entities[value].append(entity)

    logger.debug('Iterating {} took {} ms'.format(
        key, int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building {} index took {} ms'.format(
        key, int((time.time() - start) * 1000)))

    return index, mapped_entities
Example #4
def calc_common_word_ngram(text_sentence, question):
    common = 0

    # Strip punctuation
    text_sentence = remove_punctuation(text_sentence)
    question = remove_punctuation(question)

    # Lowercase and split into words
    text_sentence = tr_lower(text_sentence).strip()
    question = tr_lower(question).strip()

    n = NGram(N=N_GRAM)
    list_text_sentence = list(n.split(text_sentence))
    list_question = list(n.split(question))

    for question_word in list_question:
        if question_word in list_text_sentence:
            common += 1

    return common
Example #5
    def _similarity(self, sentences):
        """
        N-gram with similarity.

        The NGram class extends the Python ‘set’ class with efficient fuzzy search for members by
        means of an N-gram similarity measure.

        Reference:
            Václav Chvátal and David Sankoff.
            Longest common subsequences of two random sequences, 1975.
            Journal of Applied Probability.

            Python module: ngram (https://pypi.org/project/ngram/)
        """

        ngram = NGram(self.corpus.split(), key=lambda x: x.lower(), N=self.N)
        predicts = []

        if not isinstance(sentences, list):
            sentences = [sentences]

        for sentence in sentences:
            split = []

            for x in sentence.split():
                sugg = ngram.find(x.lower()) if x not in string.punctuation else None
                split.append(sugg if sugg else x)

            predicts.append(" ".join(split))

        return predicts
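A rough usage sketch of the fuzzy lookup above (hypothetical five-word corpus; find returns the closest item, or None when nothing is similar enough):

from ngram import NGram

corpus = NGram("the quick brown fox jumps".split(), key=lambda x: x.lower(), N=2)
print(corpus.find("quik"))  # likely 'quick'
print(corpus.find("zzzz"))  # None: shares no bigrams with the corpus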
Example #6
def main(left_path, left_column, right_path, right_column, outfile, titles,
         join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp,
                  key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
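A hypothetical invocation of the join above; the file names, column indices, and scores are made up for illustration:

# Fuzzy-join column 0 of left.csv against column 0 of right.csv,
# keeping at most 5 matches per row with similarity >= 0.24.
main("left.csv", 0, "right.csv", 0, "joined.csv",
     titles=True, join="inner", minscore=0.24, warp=1.0, count=5)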
Example #7
    def __init__(self,
                 lang,
                 io_dir,
                 max_file_count=0,
                 thread_count=1,
                 uri=None):
        self.BASE_DIR = BASE_DIR
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)
        self.max_file_count = max_file_count
        self.thread_count = thread_count

        self.client = MongoClient(uri)
        self.db = self.client['nlp']
        self.Collect = self.db['kcm']

        # ngram search
        self.kcmNgram = NGram((i['key'] for i in self.Collect.find({}, {
            'key': 1,
            '_id': False
        })))
        logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                            filename='KCM_{}.log'.format(self.lang),
                            level=logging.INFO)
        logging.info('Begin gen_kcm.py')
        logging.info('input {self.max_file_count} files, '
                     'output to {self.io_dir}, '
                     'maximum file count {self.max_file_count}, '
                     'use {self.thread_count} threads'.format(self=self))
Example #8
    def dynamicallyAddNewLanguage(self, lan):
        self.LANGUAGES.append(lan)
        self.LanToNGrams[lan] = NGram(lan=lan,
                                      N=self.N,
                                      V=self.V,
                                      weight=self.weight)
        self.LanToIndex[lan] = len(self.LanToIndex)
        newRow = [[0 for i in range(0, len(self.confusion_matrix[0]))]]
        self.confusion_matrix = np.append(self.confusion_matrix, newRow, 0)
        newCol = [[0] for i in range(0, len(self.confusion_matrix))]

        self.confusion_matrix = np.append(self.confusion_matrix, newCol, 1)

        while len(self._TP_per_class) != len(self.LANGUAGES):
            self._precision_per_class = np.hstack(
                [self._precision_per_class,
                 np.array([0.0])])
            self._recall_per_class = np.hstack(
                [self._recall_per_class,
                 np.array([0.0])])
            self._TP_per_class = np.hstack(
                [self._TP_per_class, np.array([0.0])])
            self._FP_per_class = np.hstack(
                [self._FP_per_class, np.array([0.0])])
            self._FN_per_class = np.hstack(
                [self._FN_per_class, np.array([0.0])])
            self._TN_per_class = np.hstack(
                [self._TN_per_class, np.array([0.0])])
            self.probability_per_language = np.hstack(
                [self.probability_per_language,
                 np.array([0.0])])
Example #9
def build_genre_index(movies, tvshows):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set(
            itertools.chain.from_iterable(
                [entity['genre'] for entity in entities])))

    mapped_entities = {}
    for entity in entities:
        for genre in entity['genre']:
            if genre not in mapped_entities:
                mapped_entities[genre] = []

            mapped_entities[genre].append(entity)

    logger.debug('Iterating genre took {} ms'.format(
        int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram(items=values, key=lambda x: x.lower())
    logger.debug('Building genre index took {} ms'.format(
        int((time.time() - start) * 1000)))

    return index, mapped_entities
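A possible lookup against the returned index (movies and tvshows are hypothetical Kodi-style dicts with a 'genre' list; search yields (genre, similarity) pairs):

index, mapped_entities = build_genre_index(movies, tvshows)
for genre, score in index.search("sci fi"):
    print(genre, score, len(mapped_entities[genre]))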
Example #10
def sonucbul():
    kelimeler = list()

    v = NGram(ngramdatawords)
    sonucthreshold = list()
    sonuckelime = list()

    kelimedizisi = np.zeros((1, len(ngramdatawords)), dtype='int8')
    yorum = e1.get()
    cevirici = str.maketrans('', '', punctuation)
    yorum = yorum.translate(cevirici)
    cevirici = str.maketrans('', '', digits)
    yorum = yorum.translate(cevirici)
    yorum = yorum.lower()
    kelimeler.clear()
    kelimeler = yorum.split()
    for j in range(0, len(kelimeler), 1):
        sonucthreshold.clear()
        sonuckelime.clear()
        for ngrami in v.search(kelimeler[j], threshold=0.4):
            sonuckelime.append(str(ngrami[0]))
            sonucthreshold.append(float(ngrami[1]))  # float: int() would truncate every similarity to 0
        if sonuckelime:
            kelimedizisi[0][ngramdatawords.index(
                sonuckelime[sonucthreshold.index(max(sonucthreshold))])] += 1
    tmpdf = pd.DataFrame(kelimedizisi)
    sonuc = ngrammodel.predict(tmpdf)
    cevirici = str.maketrans('', '', punctuation)
    cevap = str(sonuc).translate(cevirici)
    print("Yorum= " + yorum + " Yorum Sonucu= " + str(sonuc))

    e1.delete(0, END)
    Label(master, text="Puan(1-5) =" + str(cevap)).grid(row=2)
Example #11
def build_title_index(movies, tvshows):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [entity['title'] for entity in entities]

    mapped_entities = {}
    for entity in entities:
        value = entity['title']
        if value not in mapped_entities:
            mapped_entities[value] = []

        mapped_entities[value].append(entity)

    logger.debug('Iterating title took {} ms'.format(
        int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building title index took {} ms'.format(
        int((time.time() - start) * 1000)))

    return index, mapped_entities
Example #12
    def filterByOp(self, clone):
        opStr1 = ""
        opStr2 = ""
        indx1, start1, end1 = clone[1]
        indx2, start2, end2 = clone[2]

        for i in range(start1, end1 + 1):
            opStr1 += str(self.op1_hash.get(i, -1))
        for i in range(start2, end2 + 1):
            opStr2 += str(self.op2_hash.get(i, -1))

        if config.DEBUG:
            print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
            print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))

        if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
            return None

        idx = NGram(N=config.NGRAM)
        ngram1 = list(idx.ngrams(opStr1))
        ngram2 = list(idx.ngrams(opStr2))
        metric = self.compareList(ngram1,ngram2)

        return metric
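For reference, ngrams just slides a window of length N over the string, so with N=3 an op string like "nnn+" yields two trigrams (a quick check, assuming the ngram PyPI package):

from ngram import NGram

idx = NGram(N=3)
print(list(idx.ngrams("nnn+")))  # ['nnn', 'nn+']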
Example #13
def build_collection_index(movies, tvshows):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set([
            parse_collection(entity['set']) for entity in entities
            if 'set' in entity and len(entity['set']) > 0
        ]))

    mapped_entities = {}
    for entity in entities:
        if 'set' in entity and entity['set']:
            value = parse_collection(entity['set'])
            if value not in mapped_entities:
                mapped_entities[value] = []

            mapped_entities[value].append(entity)

    logger.debug('Iterating collection took {} ms'.format(
        int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building collection index took {} ms'.format(
        int((time.time() - start) * 1000)))

    return index, mapped_entities
Example #14
 def update_dict(self, word):
     if word.lower() not in self.stop_words:
         if word in self.word_dict:
             self.word_dict[word].increment_freq()
         else:
             self.word_dict[word] = NGram(1, word, 1)
Example #15
 def update_dict(self, phrase):
     phrase_str = " ".join(phrase)
     if phrase_str in self.word_dict:
         self.word_dict[phrase_str].increment_freq()
     else:
         self.word_dict[phrase_str] = NGram(int(self.argv[3]), phrase_str, 1)
Example #16
 def test_set_operations(self):
     """Test advanced set operations"""
     items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
     items2 = set(["cdefg", "lmnop"])
     idx1 = NGram(items1)
     idx2 = NGram(items2)
     results = lambda L: sorted(x[0] for x in L)
     # Item removal
     self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
     idx1.remove('abcde')
     self.assertEqual(results(idx1.search('cde')), ["cdefg"])
     # Set intersection operation
     items1.remove('abcde')
     idx1.intersection_update(idx2)
     self.assertEqual(idx1, items1.intersection(items2))
     self.assertEqual(results(idx1.search('lmn')), [])
     self.assertEqual(results(idx1.search('ijk')), [])
     self.assertEqual(results(idx1.search('def')), ['cdefg'])
Example #17
 def update_dict(self, word):
     if word in self.attr_dict:
         # if the word in the article has the given attribute
         if self.attribute in self.attr_dict[word]:
             self.attr_count += 1
             if word in self.word_dict:
                 self.word_dict[word].increment_freq()
             else:
                 self.word_dict[word] = NGram(1, word, 1)
Example #18
def test():
    op_filter = opFilter()

    opStr1 = "nnn+"
    opStr2 = "nn+"

    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))

    print(op_filter.compareList(l1, l2))
Example #19
    def __init__(self, text):
        self.ng = []
        self.topic = []

        with open(text, "r") as f:
            for linea in f:
                words = linea.split(" ")
                self.topic.append(words[0])
                nng = NGram(words[2].split(","))
                self.ng.append(nng)
Example #20
 def update_dict(self, phrase):
     phrase_str = " ".join(phrase)
     if phrase_str in self.word_dict:
         self.word_dict[phrase_str].increment_freq()
     else:
         self.word_dict[phrase_str] = NGram(
             len(phrase),
             phrase_str,
             1,
             self.get_sentiment_val(phrase),
             self.get_topic_ref_count(phrase_str),
         )
Example #21
def ngram_similarity(data, col1, col2):
    cos = []
    for i in range(len(data.id)):
        st = data[col1][i]
        title = data[col2][i]
        # NOTE: the n-gram search results are discarded; only the TF-IDF
        # cosine similarity below contributes to the returned scores.
        n = NGram(title.split(), key=lambda x: x[1])
        for s in st.split():
            n.search(s)

        tfidf = sktf.TfidfVectorizer().fit_transform([st, title])
        c = ((tfidf * tfidf.T).A)[0, 1]
        cos.append(c)
    return cos
Example #22
 def update_dict(self, phrase):
     phrase_str = " ".join(phrase)
     if phrase_str in self.word_dict:
         self.word_dict[phrase_str].increment_freq()
     else:
         self.word_dict[phrase_str] = NGram(
             int(self.argv[3]),
             phrase_str,
             1,
             self.get_sentiment_val(phrase),
             0,  # don't care about topic_ref
             self.get_welcome_val(phrase),
         )
Example #23
    def __init__(self, model_dir=None, conf_file=None):
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        self._load_config(conf_file)
        self.ngram = NGram(model_dir)
Example #24
    def test_init_1gram(self):
        ngram = NGram(1, self.sents)

        counts = {
            (): 10,
            ('el', ): 1,
            ('gato', ): 1,
            ('come', ): 2,
            ('pescado', ): 1,
            ('.', ): 2,
            ('la', ): 1,
            ('gata', ): 1,
            ('salmón', ): 1,
        }
        print("ngram.counts:")
        print(dict(ngram.counts))
        self.assertEqual(dict(ngram.counts), counts)
Example #25
    def test_count_1gram(self):
        ngram = NGram(1, self.sents)

        counts = {
            (): 12,
            ('el', ): 1,
            ('gato', ): 1,
            ('come', ): 2,
            ('pescado', ): 1,
            ('.', ): 2,
            ('</s>', ): 2,
            ('la', ): 1,
            ('gata', ): 1,
            ('salmón', ): 1,
        }
        for gram, c in counts.items():
            self.assertEqual(ngram.count(gram), c)
Example #26
    def __init__(self, n, loaded=False):
        """
        Constructor
        
        Parameters
        ----------
        n : int
            ngram size
        """
        self.ngram_gen = NGram(N=n)

        self.size = n
        self.ngram_index = {"":0}
        self.index_ngram = {0:""}
        self.cpt = 0
        self.max_len = 0

        self.loaded = loaded
Example #27
def ngram(w1, w2, n):
    """
    ngram distance
    """
    pad = lambda x: "#{}#".format(x)
    w1, w2 = pad(w1), pad(w2)

    g1 = [w1[i:i + n] for i in range(len(w1) - n + 1)]
    g2 = [w2[i:i + n] for i in range(len(w2) - n + 1)]

    # compute ngram distance:
    # d(a, b) = |a| + |b| - 2 * |a intersection b|
    ng = NGram(g1)
    ng.intersection_update(g2)

    d = len(g1) + len(g2) - 2 * len(list(ng))
    return d
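A quick worked check with bigrams (n=2): "#hello#" and "#hallo#" each give 6 bigrams and share 4 of them ('#h', 'll', 'lo', 'o#'), so the distance is 6 + 6 - 2*4 = 4:

print(ngram("hello", "hallo", 2))  # 4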
Example #28
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'),
                         [('askfjwehiuasdfji', 1.0),
                          ('asdfawe', 0.17391304347826086),
                          ('asfwef', 0.083333333333333329),
                          ('adfwe', 0.041666666666666664)])
        self.assertEqual(
            idx.search('afadfwe')[:2], [('adfwe', 0.59999999999999998),
                                        ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Example #29
def get_ngram_similarity(gold, candidates, N=3, strip_space=True):
    def _strip_space(s):
        if not strip_space:
            return s
        return "\n".join([part.strip(" ") for part in s.split("\n")])

    ng = NGram([_strip_space(gold)], N=N)

    sims = []

    for c in candidates:
        ng_out = ng.search(_strip_space(c))
        if len(ng_out) == 0:
            sims.append(0.0)
        else:
            sims.append(ng_out[0][1])

    return sims
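An illustrative call (made-up strings): the first candidate is identical to the gold string and should score near 1.0, the second lower:

gold = "for i in range(10):\n    print(i)"
candidates = [gold, "for j in range(20):\n    print(j * 2)"]
print(get_ngram_similarity(gold, candidates, N=3))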
Example #30
    def __init__(self, lang, uri=None, ngram=False):
        self.client = pymongo.MongoClient(uri)
        self.uri = uri
        self.lang = lang
        self.db = self.client['nlp_{}'.format(self.lang)]
        self.fs = gridfs.GridFS(self.db)

        self.Collect = self.db['pmi']
        self.cpus = math.ceil(mp.cpu_count() * 0.2)
        self.frequency = {}

        if ngram:
            # use ngram for searching
            self.pmiNgram = NGram((i['key']
                                   for i in self.db['pmi'].find({}, {
                                       'key': 1,
                                       '_id': False
                                   })))