Example #1
def str_common_grams(str1, str2, min_len=3, max_len=4):
    '''Return how many times the ngrams (of length min_len to max_len) of str1
    appear in str2.
    '''
    grams1 = list(everygrams(str1, min_len, max_len))
    grams2 = list(everygrams(str2, min_len, max_len))
    return sum(grams2.count(gram) for gram in grams1)
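A minimal usage sketch (the inputs and expected count are illustrative, not from the original source); since everygrams iterates a str character by character, this counts shared character 3- and 4-grams:

from nltk.util import everygrams  # assumed to be imported by the original module

# 'ban' matches once and each of the two 'ana' grams of 'banana' matches once,
# so the shared-gram count is 3.
print(str_common_grams('banana', 'bandana'))  # 3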
Example #2
 def setup_class(self):
     text = [list("abcd"), list("egdbe")]
     self.trigram_counter = NgramCounter(
         everygrams(sent, max_len=3) for sent in text)
     self.bigram_counter = NgramCounter(
         everygrams(sent, max_len=2) for sent in text)
     self.case = unittest.TestCase()
Example #3
    def setUpClass(cls):

        text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            (everygrams(sent, max_len=3) for sent in text))
        cls.bigram_counter = NgramCounter(
            (everygrams(sent, max_len=2) for sent in text))
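The counters built in this setup can then be queried by unigram or by context. A minimal sketch, not part of the original test class, assuming nltk.lm.NgramCounter and nltk.util.everygrams:

from nltk.lm import NgramCounter
from nltk.util import everygrams

text = [list("abcd"), list("egdbe")]
trigram_counter = NgramCounter(everygrams(sent, max_len=3) for sent in text)

print(trigram_counter['b'])         # unigram count of 'b' -> 2
print(trigram_counter[['a']]['b'])  # count of the bigram ('a', 'b') -> 1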
Example #4
def custom_sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    from collections import Counter
    from nltk.util import everygrams

    assert len(references) == 1

    hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

    reference = references[0]

    ref_ngrams = Counter(everygrams(reference, min_len, max_len))
    tpfn = sum(ref_ngrams.values())  # True positives + False negatives.
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.
    n_all = max(tpfp, tpfn)

    n_match = tp if n_all > 0 else 0

    # corner case: empty corpus or empty references---don't divide by zero!
    if n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = n_match / n_all

    return gleu_score, n_match, tpfp, tpfn
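A minimal usage sketch with illustrative inputs (not part of the original code); an identical reference and hypothesis make every n-gram match, so precision and recall are both 1:

ref = ['the', 'cat', 'sat']
hyp = ['the', 'cat', 'sat']
score, n_match, tpfp, tpfn = custom_sentence_gleu([ref], hyp)
print(score, n_match, tpfp, tpfn)  # 1.0 6 6 6 (a 3-token sentence has six n-grams of length 1-3)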
Example #5
def abstraction(chaine, ancestor):
    if all(isinstance(x, Variable) for x in chaine):
        # print(Type(chaine, ancestor))
        # print()
        # print()
        return
    else:
        if isinstance(chaine, ChaineConcrete):
            # print(Type(chaine, chaine))
            # print()
            spans = everygrams([x for x, y in enumerate(chaine)])

            lignee = [variabilise(chaine, span) for span in list(spans)]
        else:
            # print(Type(chaine, ancestor))
            # print()
            lignee = []
            for i, element in enumerate(chaine):
                debut = [chaine[:i]]
                fin = [chaine[i + 1:]]
                if not isinstance(element, Variable):
                    spans = everygrams([x for x, y in enumerate(element)])
                    for span in spans:
                        chainex = variabilise(element, span)
                        print(chainex, fin)
                        if i == 0:
                            lignee.append(chainex + fin)
                        elif i == len(chaine) - 1:
                            lignee.append(debut + chainex)
                        else:
                            lignee.append(debut + chainex + fin)
    for descendant in lignee:
        abstraction(descendant, chaine)
Example #6
def str_common_grams(str1, str2, min_len=3, max_len=4):
    '''Return how many times the ngrams (of length min_len to max_len) of str1
    appear in str2.
    '''
    grams1 = list(everygrams(str1, min_len, max_len))
    grams2 = list(everygrams(str2, min_len, max_len))
    return sum(grams2.count(gram) for gram in grams1)
Example #7
def corpus_chrf(list_of_references,
                hypotheses,
                min_len=1,
                max_len=6,
                beta=3.0):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    micro-averaged value of the sentence/segment level CHRF score.

    CHRF only supports a single reference.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> ref2 = str('It is the guiding principle which guarantees the military '
        ...            'forces always being under the command of the Party').split()
        >>>
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct')
        >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
        0.4915...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str)) / list(str)
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str)) / list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :return: the corpus level CHRF score.
    :rtype: float
    """

    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(list_of_references, hypotheses):
        # Cheating condition to allow users to input strings instead of tokens.
        if not isinstance(reference, str) and not isinstance(hypothesis, str):
            reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
        # For each order of ngram, calculate the no. of ngram matches and
        # keep track of no. of ngram in references.
        ref_ngrams = Counter(everygrams(reference, min_len, max_len))
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        overlap_ngrams = ref_ngrams & hyp_ngrams
        tp = sum(overlap_ngrams.values())  # True positives.
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
        tffn = sum(ref_ngrams.values())  # True positives + False negatives.

    precision = tp / tpfp
    recall = tp / tffn
    factor = beta**2
    score = (1 + factor) * (precision * recall) / (factor * precision + recall)
    return score
Example #8
    def setUpClass(cls):

        text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            (everygrams(sent, max_len=3) for sent in text)
        )
        cls.bigram_counter = NgramCounter(
            (everygrams(sent, max_len=2) for sent in text)
        )
Example #9
def grams(text):
    #Character grams
    for i in list(
            everygrams(''.join([c for c in text if c != ' ']),
                       min_len=1,
                       max_len=4)):
        yield i
    #Word grams
    for i in list(everygrams(text.split(' '), min_len=1, max_len=3)):
        yield i
Example #10
 def kmers(self, content):
     try:
         size = int(self.size)
         ngramG = ngrams(content, size)
         return [''.join(i) for i in ngramG]
     except ValueError:
         # self.size is a range such as "3to5": split it into min and max sizes.
         minsize, maxsize = self.size.replace('to', ' ').split(' ')
         return [''.join(i) for i in everygrams(content, int(minsize), int(maxsize))]
Example #11
def custom_corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Copy of the GLEU implementation in NLTK that also returns n_match and n_all
    :param list_of_references:
    :param hypotheses:
    :param min_len:
    :param max_len:
    :return:
    """
    # sanity check
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(
                ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score, corpus_n_match, corpus_n_all
Example #12
File: chrf_score.py Project: DrDub/nltk
def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    micro-averaged value of the sentence/segment level CHRF score.

    CHRF only supports a single reference.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> ref2 = str('It is the guiding principle which guarantees the military '
        ...            'forces always being under the command of the Party').split()
        >>>
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct')
        >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
        0.4915...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str)) / list(str)
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str)) / list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :return: the corpus level CHRF score.
    :rtype: float
    """

    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their references should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(list_of_references, hypotheses):
        # Cheating condition to allow users to input strings instead of tokens.
        if not isinstance(reference, str) and not isinstance(hypothesis, str):
            reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
        # For each order of ngram, calculate the no. of ngram matches and
        # keep track of no. of ngram in references.
        ref_ngrams = Counter(everygrams(reference, min_len, max_len))
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        overlap_ngrams = ref_ngrams & hyp_ngrams
        tp = sum(overlap_ngrams.values()) # True positives.
        tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
        tffn = sum(ref_ngrams.values()) # True positives + False negatives.

    precision = tp / tpfp
    recall = tp / tffn
    factor = beta**2
    score = (1 + factor) * (precision * recall) / (factor * precision + recall)
    return score
Example #13
 def _compute_rouge(self, words_t, words_c):
     ngrams_t = set(everygrams(words_t, max_len=self.max_gram))
     ngrams_c = everygrams(words_c, max_len=self.max_gram)
     match_count = 0
     total_count = 0
     for ngram_c in ngrams_c:
         total_count += 1
         if ngram_c in ngrams_t:
             match_count += 1
     if total_count == 0:
         warnings.warn('empty template for title:{}'.format(
             ' '.join(words_t)))
         return 0
     score = match_count / total_count
     return score
Example #14
def ngram_counts(tokens,
                 min_len=1,
                 max_len=None,
                 transform=" ".join,
                 in_vocabulary=lambda _: True):
    """
    Compute n-gram counts using toolz and Counter

    :param tokens: Iterable[str]
    :param min_len: int Minimum N-Gram size
    :param max_len: int Maximum N-Gram size
    :param transform: Callable[[Tuple[str, ...]], str] Function transforming an ngram tuple into a key
    :param in_vocabulary: Callable[[str], bool] Should token be preserved
    :return: Dict[str, int]
    """
    tokens = list(tokens)
    wc = len(tokens)
    max_len = (max_len if max_len else wc) + 1
    return (
        wc,
        pipe(
            everygrams(tokens, min_len=min_len, max_len=max_len),
            map(transform),
            filter(in_vocabulary),
            frequencies,
        ),
    )
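A minimal usage sketch (illustrative input); the snippet above relies on pipe and frequencies plus the curried map/filter from toolz, which the original module is assumed to import:

wc, counts = ngram_counts("the cat sat".split(), min_len=1, max_len=2)
print(wc)                 # 3
print(counts["the cat"])  # 1  (the default transform joins each ngram tuple with spaces)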
Example #15
    def evaluate_context(self, sentence, word):
        sentTokens = sentence.split()

        if word in sentTokens:
            index = sentTokens.index(word)
        else:
            return False
        if index == 0:
            left = index
            right = min(len(sentTokens), index + self.left + self.right)
        elif index == len(sentTokens) - 1:
            right = index
            left = max(0, index - self.right - self.left)
        else:
            left = max(0, index - self.left)
            right = min(len(sentTokens) - 1, index + self.right)

        scores = []
        tokens = sentTokens[left:right + 1]
        nGrams = list(everygrams(tokens))
        possibleNgrams = [n for n in nGrams if word in n]
        wordScores = []
        for n in possibleNgrams:
            #             print("N gram is " , n )
            bosFlag = (sentTokens.index(n[0]) == 0 and left == 0)
            eosFlag = (sentTokens.index(n[len(n) - 1]) == len(sentTokens) - 1
                       and right == len(sentTokens) - 1)
            #             print(" Eos flag ",eosFlag , " BOS flag",bosFlag)
            wordScores.append(
                self.langModel.score(" ".join(n), eos=eosFlag, bos=bosFlag))
        #print("Candidate : ", candidate)
        #         print(candidateScores)
        scores.append(np.average(wordScores))
        return np.average(wordScores)
Example #16
def get_ngrams(max_len, tokens, delimiter=" "):
    """Get ngrams (sequences of consecutive tokens) from tokens.

    ngrams are sequences of consecutive tokens. Return an iterator of all
    ngrams of length at most max_len represented as the concatenation of the
    constituent tokens, delimited by delimiter.

    Args:
        max_len (int): Max length of ngram to consider.
        tokens (iterable of str): Token iterator.
        delimiter (str, optional): Separator to use between tokens in an ngram.

    Returns:
        iterable of str: String representations of each ngram.

    Examples:
        To use `get_ngrams` directly on an iterable of tokens:

        >>> list(get_ngrams(2, ["a", "b", "c"]))
        ['a', 'b', 'c', 'a b', 'b c']

        To use `get_ngrams` on a stream of token iterables:

        >>> tokens_gen = iter([["a", "b", "c"],
        ...                    ["d", "e", "f"]])
        >>> from functools import partial
        >>> ngrams_gen = map(partial(get_ngrams, 2), tokens_gen)
        >>> from twitter_analysis_tools.utils import listify_nested_iterables
        >>> listify_nested_iterables(ngrams_gen)
        [['a', 'b', 'c', 'a b', 'b c'], ['d', 'e', 'f', 'd e', 'e f']]
    """
    # Gotcha: everygrams doesn't work with iterables. Only lists.
    ngrams = everygrams(list(tokens), max_len=max_len)
    return map(delimiter.join, ngrams)
Example #17
def test_everygrams_min_len(everygram_input):
    expected_output = [
        ("a", "b"),
        ("a", "b", "c"),
        ("b", "c"),
    ]
    output = list(everygrams(everygram_input, min_len=2))
    assert output == expected_output
Example #18
 def test_everygrams_min_len(self):
     expected_output = [
         ('a', 'b'),
         ('b', 'c'),
         ('a', 'b', 'c'),
     ]
     output = everygrams(self.test_data, min_len=2)
     self.assertCountEqual(output, expected_output)
Example #19
def extract_features(document):
    words = word_tokenize(document)
    lemmas = [str(lemmatizer.lemmatize(w)) for w in words if w not in stopwords_eng and w not in punctuation]
    document = " ".join(lemmas)
    document = document.lower()
    document = re.sub(r'[^a-zA-Z0-9\s]', ' ', document)
    words = [w for w in document.split(" ") if w!="" and w not in stopwords_eng and w not in punctuation]
    return [str('_'.join(ngram)) for ngram in list(everygrams(words, max_len=3))]
Example #20
def findScoreForAlignmentUsingnGrams(srcSent, tgtList, transDict):
    """Find alignment score for a source sentence with a list of target sentences."""
    wordsInSourceSent = word_tokenize(srcSent.lower())
    wordsInSrc = len(wordsInSourceSent)
    sourceNgrams = list(everygrams(wordsInSourceSent, max_len=2))
    sourceNgrams = create_string_ngrams(sourceNgrams)
    srcDict = Counter(sourceNgrams)
    scores = list()
    tgtDicts, allTgtWords = list(), list()
    for tgt in tgtList:
        wordsInTgt = word_tokenize(tgt.lower())
        allTgtWords.append(wordsInTgt)
        tgtNgrams = list(everygrams(wordsInTgt, max_len=2))
        tgtNgrams = create_string_ngrams(tgtNgrams)
        tgtDicts.append(Counter(tgtNgrams))
    print(len(transDict))
    for index, tgtDict in enumerate(tgtDicts):
        count = 0
        matchedNgrams = list()
        for src in srcDict:
            if re.search(r'\d+(\.\d+)?', src):
                if src in tgtDict:
                    matchedNgrams.append((src, src))
                    count += 1
            elif src in transDict:
                foundTgt = transDict[src]
                for ngrm in foundTgt:
                    if ngrm in tgtDict and tgtDict[ngrm] == srcDict[src]:
                        matchedNgrams.append((src, ngrm))
                        count += tgtDict[ngrm]
                        break
                    elif ngrm in tgtDict and tgtDict[ngrm] != srcDict[src]:
                        break
        wordsInTgt = len(allTgtWords[index])
        if abs(wordsInSrc - wordsInTgt) in range(5):
            lengthValue = 0.2
        else:
            lengthValue = 1 / abs(wordsInSrc - wordsInTgt)
        print(count)
        print(matchedNgrams)
        if count == 0.:
            scores.append((1e-5, lengthValue))
        else:
            scores.append((count / len(sourceNgrams), lengthValue))
    print(scores)
    return np.array(scores)
Example #21
def get_ngram_word_dict(emotion_line):
    words = nltk.word_tokenize(emotion_line)
    word_ngram = everygrams(words, min_len=1, max_len=3)

    word_feats = {}
    for w in word_ngram:
        if w not in word_feats:
            word_feats[w] = "feature_word"

    return word_feats
Example #22
def test_everygrams_without_padding(everygram_input):
    expected_output = [
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
    output = list(everygrams(everygram_input))
    assert output == expected_output
Example #23
 def test_everygrams_without_padding(self):
     expected_output = [
         ('a', ),
         ('a', 'b'),
         ('a', 'b', 'c'),
         ('b', ),
         ('b', 'c'),
         ('c', ),
     ]
     output = everygrams(self.test_data)
     self.assertCountEqual(output, expected_output)
Example #24
def encode_sentences(txt):
	feature_set=np.zeros((len(txt), len(word_set)+1),dtype=int)
	tnum=0
	for t in txt:
		s_words=t[1:]+list(set(list(everygrams(t[1:], min_len=2,max_len=2))))
		for w in s_words:
			idx=word_idx[w]
			feature_set[tnum][idx]=1
		feature_set[tnum][-1]=t[0]
		tnum+=1
	return feature_set
Example #25
 def initialiseCorrespondances(self):
     for couple in self.corpus:
         (langue1,langue2) = couple
         for correspond1 in self.correspondancesLangue1:
             if correspond1 in [" ".join(x) for x in everygrams(langue1.split(' '))]:
                 if couple not in self.correspondancesLangue1[correspond1]:
                     self.correspondancesLangue1[correspond1].append(couple)
         for correspond2 in self.correspondancesLangue2:
             if correspond2 in langue2:
                 if couple not in self.correspondancesLangue2[correspond2]:
                     self.correspondancesLangue2[correspond2].append(couple)
Example #26
    def get_score(self, sentTokens, word, candidate):
        #         print(word , " " , candidate)

        if len(word.split()) > 1:
            sentence = " ".join(sentTokens)
            sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
            sentence = sentence.replace(word, "#")
            sentTokens = sentence.split()
            if "#" not in sentTokens:
                print("MOSHKLAAAAAAAAAA TNYAAA 3ND ", self.left, " MODEL")
                print(sentence)
                print(word, " ", candidate)
                index = 0
            else:
                index = sentTokens.index("#")
            sentTokens[index] = word
        if word in sentTokens:
            index = sentTokens.index(word)
        else:
            print("MOSHKLA KBERRRRRRAAAAAAA 3ND ", self.left, " MODEL")
            print(word, "###", candidate, sentTokens)
            index = 0
        ## Get window words
        if index == 0:
            left = index
            right = min(len(sentTokens), index + self.left + self.right)
        elif index == len(sentTokens) - 1:
            right = index
            left = max(0, index - self.right - self.left)
        else:
            left = max(0, index - self.left)
            right = min(len(sentTokens) - 1, index + self.right)
        scores = []
        tokens = sentTokens[left:right + 1]
        #print(tokens)

        sentTokens[
            index] = candidate  ## Put candidate in sentence to test its score
        tokens = sentTokens[left:right + 1]
        nGrams = list(everygrams(tokens))
        possibleNgrams = [n for n in nGrams if candidate in n]
        candidateScores = []
        for n in possibleNgrams:
            #             print("N gram is " , n )
            bosFlag = (sentTokens.index(n[0]) == 0 and left == 0)
            eosFlag = (sentTokens.index(n[len(n) - 1]) == len(sentTokens) - 1
                       and right == len(sentTokens) - 1)
            #             print(" Eos flag ",eosFlag , " BOS flag",bosFlag)
            candidateScores.append(
                self.langModel.score(" ".join(n), eos=eosFlag, bos=bosFlag))
        #print("Candidate : ", candidate)
        #         print(candidateScores)
        scores.append(np.average(candidateScores))
        return np.average(candidateScores)
Example #27
def read_trigrams():
    training = []
    vocab = []
    text = []
    with open('trigrams.pkl', 'rb') as f:
        data = pickle.load(f)
        for d in data:
            trigrams = list(everygrams(d, max_len=3))
            training.append(trigrams)
            for word in d:
                vocab.append(word)
            text.append(d[0])
    return training, set(vocab), text
Example #28
def convert_sentence_to_ngrams(inp_sentence: str,
                               n_param=3,
                               add_unknown=False) -> List[str]:
    '''
    Convert the input sentence to n-grams of length n_param (trigrams by default)
    using a tokenizer.
    '''
    tokenized_input = wordpunct_tokenize(inp_sentence)

    # Optionally prepend an unknown token to the tokenized input.
    if add_unknown:
        tokenized_input = ['<UNK>'] + tokenized_input

    return everygrams(tokenized_input, min_len=n_param, max_len=n_param)
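A minimal usage sketch (illustrative input); with the default n_param=3 only full trigrams are produced:

print(list(convert_sentence_to_ngrams("the cat sat down")))
# [('the', 'cat', 'sat'), ('cat', 'sat', 'down')]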
Example #29
 def EstimateNgrams(self, training_set):
     for sent in training_set:
         sent = list(sent)
         for ngram in everygrams(sent, max_len=self.max_n):
             n = len(ngram)
             self.model_map[n][ngram] += 1
     for n in self.model_map:
         for ngram in self.model_map[n]:
             if n == 1:
                 pass
             else:
                 self.log_probs[ngram] = log(
                     self.model_map[n][ngram] /
                     self.model_map[n-1][ngram[:-1]]
                 )
Example #30
 def create_ngrams(input, min_len, max_len):
     """
     create N-grams
     min_len is the minimum length of the N-grams
     max_len is the maximum length of the N-grams
     :param input:
     :param min_len:
     :param max_len:
     :return:
     """
     result = []
     for sent in input:
       sent_split = sent.split()
       result.append(list(everygrams(sent_split, min_len=min_len, max_len=max_len)))
     return result
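A minimal usage sketch (illustrative input, assuming the function is used standalone); the exact ordering of the grams may differ across NLTK versions:

print(create_ngrams(["a b c"], 1, 2))
# e.g. [[('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)]]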
Example #31
def model_iterator(n):
        perp = []
        n = n+1
        for n in range(1,n):
            print(n)
            train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
            #model = MLE(n)
            #model = Laplace(n) #only add-one smoothing here
            #model = Lidstone(0.1,n) #Lidstones second number is Gamma/Alpha/Delta
            #model = WittenBellInterpolated(n)
            model = KneserNeyInterpolated(n, discount = 0.88) #only order and discount needed, WB only order
            print(n,model)
            model.fit(train_data, padded_sents)
            print(model.vocab)
            vocab_list = []
            for word in model.vocab:
                vocab_list.append(word)
            #print(vocab_list)
            print("value",model. score('<UNK>'))
            #print(generate_sent_text_seed(model, 30, random_seed=['thicc']))

            #print(generate_sent(model, 50, random_seed = 30))
            entropy_fin = 0
            lense = 1000
            i = 0
            for z in range(lense):
                #print(contents[i])
                tokenized_test = [list(map(str.lower, word_tokenize(contents[i])))]
                if len(tokenized_test[0]) > 0:
                    for g in range(len(tokenized_test[0])):
                        if tokenized_test[0][g] not in vocab_list:
                            tokenized_test[0][g] = '<UNK>'
                    test_text_pad = list(flatten(pad_both_ends(sent, n) for sent in tokenized_test))
                    test_text_everygram = list(everygrams(test_text_pad, max_len=n))
                    #print(test_text_everygram)
                    #test_data, padded_sents_test = padded_everygram_pipeline(n, tokenized_test)
                    #print(i)
                    #print(model.entropy(test_text_bigram))
                    #print(model.entropy(test_text_everygram))
                    entropy_fin += model.entropy(test_text_everygram)
                i += 1
            print(entropy_fin)
            avg_entr = entropy_fin/lense
            print("perplexity",2**avg_entr)
            perp.append([n,2**avg_entr])
        import pandas as pd
        DF = pd.DataFrame(perp)
        return DF
Example #32
def train_texts(train_files, exclude, extension, n_ngram):
    # Training data file
    # train_data_file = "./train/treino.txt"

    # read training data
    #train_data_files = glob.glob('./train/*' + extension)
    train_data_files = train_files.copy()

    if (exclude):
        print("Training files before removing the test item: ", train_data_files)
        train_data_files.remove(exclude)

    print("Files used for training: ", train_data_files)

    train_texts = ''

    for train_data_file in train_data_files:

        try:
            #path_file_train =
            with open(os.path.join("./train", train_data_file), encoding='utf-8') as f:
                train_text = f.read().lower()
        except OSError:
            print("Could not read the training files with extension ." + extension + " in the train directory.")
            continue

        # apply preprocessing (remove text inside square and curly brackets and rem punc)
        train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
        train_text = re.sub(r'[^\w\s]', "", train_text)
        train_texts += train_text

    # pad the text and tokenize
    training_data = list(pad_sequence(word_tokenize(train_texts), n_ngram,
                                      pad_left=True,
                                      left_pad_symbol="<s>"))

    print("training_data", training_data)

    # generate ngrams
    ngrams = list(everygrams(training_data, max_len=n_ngram))
    print("Number of ngrams:", len(ngrams))

    # build ngram language models
    model = WittenBellInterpolated(n_ngram)
    model.fit([ngrams], vocabulary_text=training_data)
    print(model.vocab)

    return model
Example #33
def test_everygrams_pad_left(everygram_input):
    expected_output = [
        (None,),
        (None, None),
        (None, None, "a"),
        (None,),
        (None, "a"),
        (None, "a", "b"),
        ("a",),
        ("a", "b"),
        ("a", "b", "c"),
        ("b",),
        ("b", "c"),
        ("c",),
    ]
    output = list(everygrams(everygram_input, max_len=3, pad_left=True))
    assert output == expected_output
Example #34
def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:
    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences:
    Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
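A minimal sketch of how the two iterators are typically consumed when fitting an nltk.lm model (the MLE model and toy sentences are illustrative):

from nltk.lm import MLE

sents = [['a', 'b', 'c'], ['a', 'c']]
train, vocab = padded_everygram_pipeline(2, sents)
lm = MLE(2)
lm.fit(train, vocab)
print(lm.counts[['a']]['b'])  # 1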
Example #35
 def test_everygrams_pad_left(self):
     expected_output = [
         (None, ),
         (None, None),
         (None, None, 'a'),
         (None, ),
         (None, 'a'),
         (None, 'a', 'b'),
         ('a', ),
         ('a', 'b'),
         ('a', 'b', 'c'),
         ('b', ),
         ('b', 'c'),
         ('c', ),
     ]
     output = everygrams(self.test_data, max_len=3, pad_left=True)
     self.assertCountEqual(output, expected_output)
Example #36
File: LM.py Project: czly/DataScienceFinal
def build_one_chat_LM(fb_dict, max_len=3):
    """
        input: one chat room
        output: LM of that chatroom
    """
    final_list = []
    for line in fb_dict['msgs']:
        line = line[2]
        final_list += list(everygrams(line,
                                      max_len=max_len,
                                      pad_left=True,
                                      pad_right=True,
                                      left_pad_symbol='<s>',
                                      right_pad_symbol='</s>'))

    Counter_LM = Counter(final_list)
    
    total_count = sum(Counter_LM.values())
    total_count = float(total_count)
    for key in Counter_LM:
        Counter_LM[key] /= total_count

    return Counter_LM
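A minimal sketch with a made-up fb_dict (the original data layout is assumed here to store the tokenized message as the third element of each entry in 'msgs'):

toy_chat = {'msgs': [(0, 'alice', ['hi', 'there']), (1, 'bob', ['hi'])]}
lm = build_one_chat_LM(toy_chat, max_len=2)
print(lm[('hi',)])  # relative frequency of the unigram ('hi',) among the padded grams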
Example #37
def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level GLEU scores (i.e. macro-average
    precision), Wu et al. (2016) sum up the matching tokens and the max of
    hypothesis and reference tokens for each sentence, then compute using the
    aggregate values.

    From Mike Schuster (via email):
        "For the corpus, we just add up the two statistics n_match and
         n_all = max(n_all_output, n_all_target) for all sentences, then
         calculate gleu_score = n_match / n_all, so it is not just a mean of
         the sentence gleu scores (in our case, longer sentences count more,
         which I think makes sense as they are more difficult to translate)."

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5673...

    The example below shows that corpus_gleu() is different from averaging
    sentence_gleu() over the hypotheses

    >>> score1 = sentence_gleu([ref1a], hyp1)
    >>> score2 = sentence_gleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6144...

    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: The corpus-level GLEU score.
    :rtype: float
    """
    # sanity check
    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
        
        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0]/hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score
Example #38
File: gleu_score.py Project: DrDub/nltk
def sentence_gleu(reference, hypothesis, min_len=1, max_len=4):
    """
    Calculates the sentence level GLEU (Google-BLEU) score described in

        Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
        Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
        Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
        Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
        George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
        Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
        Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.
        eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
        Retrieved on 27 Oct 2016.

    From Wu et al. (2016):
        "The BLEU score has some undesirable properties when used for single
         sentences, as it was designed to be a corpus measure. We therefore
         use a slightly different score for our RL experiments which we call
         the 'GLEU score'. For the GLEU score, we record all sub-sequences of
         1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
         compute a recall, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the target (ground truth) sequence,
         and a precision, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the generated output sequence. Then
         GLEU score is simply the minimum of recall and precision. This GLEU
         score's range is always between 0 (no matches) and 1 (all match) and
         it is symmetrical when switching output and target. According to
         our experiments, GLEU score correlates quite well with the BLEU
         metric on a corpus level but does not have its drawbacks for our per
         sentence reward objective."

    Note: The GLEU score is designed for sentence-based evaluation, thus there is
          no corpus-based score implemented in NLTK.

    The infamous "the the the ... " example

        >>> ref = 'the cat is on the mat'.split()
        >>> hyp = 'the the the the the the the'.split()
        >>> sentence_gleu(ref, hyp)  # doctest: +ELLIPSIS
        0.0909...

    An example to evaluate normal machine translation outputs

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct').split()
        >>> sentence_gleu(ref1, hyp1) # doctest: +ELLIPSIS
        0.4393...
        >>> sentence_gleu(ref1, hyp2) # doctest: +ELLIPSIS
        0.1206...

    :param references: reference sentence
    :type references: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: the sentence level GLEU score.
    :rtype: float
    """
    # For each order of ngram, calculate the no. of ngram matches and
    # keep track of no. of ngram in references.
    ref_ngrams = Counter(everygrams(reference, min_len, max_len))
    hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values()) # True positives.
    tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
    tffn = sum(ref_ngrams.values()) # True positives + False negatives.

    precision = tp / tpfp
    recall = tp / tffn

    return min(precision, recall)
Example #39
 def remplirLexique(self):
     for phrase in self.corpusLangue1:
         self.lexiqueLangue1 |= set([" ".join(x) for x in everygrams(phrase.split(' '))])
     for phras in self.corpusLangue2:
         self.lexiqueLangue2 |= set(["".join(x) for x in everygrams(list(phras))])
Example #40
def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
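A minimal usage sketch (illustrative input); the ordering of the grams depends on the NLTK version:

print(list(padded_everygrams(2, ['a', 'b'])))
# e.g. [('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', '</s>'), ('</s>',)]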
Example #41
def firstPassGrouping():
    words = []

    stemmed = []
    features = {}
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    clean = re.compile(r"[()/']")
    split = re.compile("[/]")
    grams = []
    with open('data/features.txt', 'r') as featureIn:
        for line in map(cleanFeatures, featureIn):
            ws = []
            for w in tokenizer.tokenize(clean.sub(' ', line[1])):
                if w not in engStop:
                    stemmed.append((eng.stem(w).lower(), line[1]))
                    words.append((w.lower(), line[1]))
                    ws.append(w.lower())

            grams.append((list(everygrams(ws, min_len=2, max_len=2)), line[1]))
            features[line[0]] = line[1]


    # cuisine, style, price, atmosphere, and occasion


    noGrams = set(map(lambda x: x[1], filter(lambda x: len(x[0]) == 0, grams)))

    grams = list(filter(lambda x: len(x[0]) > 0, grams))
    groupedw = seq(grams) \
        .flat_map(lambda x: set([(w, x[1]) for w in seq(x[0]).flat_map(lambda y: list(y)).to_list()])) \
        .group_by(lambda w: w[0]) \
        .map(lambda x: (x[0], list(map(lambda y: y[1], x[1])))) \
        .to_dict()

    noGramsId = {}
    for g in noGrams:
        noGramsId[g] = g
    simGrouped = {}
    simular = set()
    for k, v in sorted(groupedw.items(), key=lambda x: x[0]):
        # print(k, v)
        nl = v.copy()
        match = noGramsId.get(k, None)
        for nk in noGramsId.keys():
            if len(nk) > 1:
                if nk in v:
                    nl.append(nk)
                    simular.add(nk)
                for vv in v:
                    if nk in vv:
                        nl.append(nk)
                        simular.add(nk)

        if match is not None:
            nl.append(match)
            simGrouped[k] = list(set(nl))
            simular.add(match)
        else:
            if len(k) > 1:
                simGrouped[k] = v

    noSim = noGrams - simular
    #
    nationalities = gazetteers.words()

    featureNationality = []
    for nosim in noSim:
        didConvert = convert(nosim)
        if didConvert is not None:
            if didConvert in nationalities:
                featureNationality.append(nosim)
        else:
            if nosim in nationalities:
                featureNationality.append(nosim)
            else:
                split = nosim.split('-')
                for sp in split:
                    if sp in nationalities:
                        featureNationality.append(nosim)

    # print("-----------------")


    noSim = noSim - set(featureNationality)
    # occasions = ['monday']
    # # cuisine, style, price, atmosphere, and occasion
    for k, v in sorted(simGrouped.items(), key=lambda x: x[0]):
        # print(k,v)
        if k in nationalities:
            featureNationality.append(k)
            featureNationality.extend(v)
            simGrouped.pop(k)
        didConvert = convert(k)
        if didConvert is not None:
            if didConvert in nationalities:
                simGrouped.pop(k)
                featureNationality.append(k)
                featureNationality.extend(v)

    with open('q1/noSim.json', 'w+') as nsOut:
        nsOut.write(json.dumps(list(noSim), indent=2, sort_keys=True))

    with open('q1/featureNationality.json', 'w+') as nsOut:
        nsOut.write(json.dumps(featureNationality, indent=2, sort_keys=True))

    with open('q1/grouped.json', 'w+') as nsOut:
        nsOut.write(json.dumps(simGrouped, indent=2, sort_keys=True))