Example #1
File: rouge.py  Project: 53X/NLP-Metrics
    def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):

        rouge_s_list = []
        k_c = len(candidate) if d_skip is None else d_skip
        cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate),
                              n=2, k=k_c))
        for ref in references:
            k_ref = len(ref) if d_skip is None else d_skip
            ref_skip_list = list(skipgrams(tokenizer.tokenize(ref),
                                 n=2, k=k_ref))
            count = 0
            for bigram in cand_skip_list:
                if bigram in ref_skip_list:
                    count = count+1
            if not smoothing:
                r_skip = count/len(ref_skip_list)
                p_skip = count/len(cand_skip_list)
            else:
                cand_ungm = list(ngrams(tokenizer.tokenize(candidate),
                                      n=1))
                ref_ungm = list(ngrams(tokenizer.tokenize(ref),
                                     n=1))
                for ungm in cand_ungm:
                    if ungm in ref_ungm:
                        count += 1
                r_skip = count/(len(ref_skip_list)+len(ref_ungm))
                p_skip = count/(len(cand_skip_list)+len(cand_ungm))
            score = Rouge.get_score(r_skip, p_skip, beta)           
            rouge_s_list.append(score)
        return Rouge.jacknifing(rouge_s_list, averaging=averaging)
Example #2
    def rouge_s(references,
                candidate,
                beta,
                d_skip=None,
                averaging=True,
                smoothing=False):

        rouge_s_list = []
        k_c = len(candidate) if d_skip is None else d_skip
        cand_skip_list = list(
            skipgrams(tokenizer.tokenize(candidate), n=2, k=k_c))
        for ref in references:
            k_ref = len(ref) if d_skip is None else d_skip
            ref_skip_list = list(
                skipgrams(tokenizer.tokenize(ref), n=2, k=k_ref))
            count = 0
            for bigram in cand_skip_list:
                if bigram in ref_skip_list:
                    count = count + 1
            if not smoothing:
                r_skip = count / len(ref_skip_list)
                p_skip = count / len(cand_skip_list)
            else:
                cand_ungm = list(ngrams(tokenizer.tokenize(candidate), n=1))
                ref_ungm = list(ngrams(tokenizer.tokenize(ref), n=1))
                for ungm in cand_ungm:
                    if ungm in ref_ungm:
                        count += 1
                r_skip = count / (len(ref_skip_list) + len(ref_ungm))
                p_skip = count / (len(cand_skip_list) + len(cand_ungm))
            score = Rouge.get_score(r_skip, p_skip, beta)
            rouge_s_list.append(score)
        return Rouge.jacknifing(rouge_s_list, averaging=averaging)
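Both examples above compute ROUGE-S as an F-measure over skip-bigram overlap between a candidate and each reference. A rough, self-contained sketch of the same idea, using nltk.util.skipgrams with plain whitespace tokenization in place of the project's tokenizer, Rouge.get_score and Rouge.jacknifing helpers (those substitutions are assumptions, not the original code):

from nltk.util import skipgrams

def rouge_s_sketch(reference, candidate, beta=1.0, k=4):
    # Skip-bigrams within a window of k over whitespace tokens; real ROUGE uses a proper tokenizer.
    ref_sk = list(skipgrams(reference.split(), 2, k))
    cand_sk = list(skipgrams(candidate.split(), 2, k))
    overlap = sum(1 for bg in cand_sk if bg in ref_sk)
    if not ref_sk or not cand_sk or overlap == 0:
        return 0.0
    r = overlap / len(ref_sk)    # skip-bigram recall
    p = overlap / len(cand_sk)   # skip-bigram precision
    return ((1 + beta ** 2) * p * r) / (r + beta ** 2 * p)

print(rouge_s_sketch("the cat sat on the mat", "the cat was on the mat"))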
Example #3
    def skipgrams(self,
                  n,
                  k,
                  words=False,
                  filtrate=False,
                  lower=True,
                  **kwargs):
        method = self.words if words else self.lemmas
        yield from nltk.skipgrams(method(filtrate=filtrate, lower=lower), n, k,
                                  **kwargs)
Example #4
def skip_grams(tokens, n, k):
    skip_gram_value = 0
    a = [x for x in nltk.skipgrams(tokens, n, k)]
    for j in range(len(a)):
        for k in range(n):
            ss = sid.polarity_scores(a[j][k])
            if (ss["pos"] == 1):
                skip_gram_value += 1
            if (ss["neg"] == 1):
                skip_gram_value -= 1
    return skip_gram_value
Example #5
    def __call__(self, doc):
        tokens = list(self.sentiment_aware_tokenize(doc))

        if self.negate:
            tokens = nltk.sentiment.util.mark_negation(tokens)

        if self.n == 1:
            return tokens

        skipgrams = list(nltk.skipgrams(tokens, self.n, self.k))
        return list([' '.join(s) for s in skipgrams])
Example #6
def SkipBigrams(x):
    all_skip_bigrams = []
    final_list = []
    for sentence in x:
        all_skip_bigrams.append(nltk.skipgrams(sentence, 2, 5))

    for skipgram_sentence in all_skip_bigrams:
        for tupl in skipgram_sentence:
            final_list.append(tupl[0] + ' ' + tupl[1])

    return final_list
Example #7
def skip_grams(tokens, n, k):
    skip_gram_value = 0     # k is the history (skip) parameter that defines the context window
    a = [x for x in nltk.skipgrams(tokens, n, k)]
    for j in range(len(a)):
        for k in range(n):
            ss = sid.polarity_scores(a[j][k])
            if (ss["pos"] == 1):
                skip_gram_value += 1
            if (ss["neg"] == 1):
                skip_gram_value -= 1
    return skip_gram_value
Example #8
    def __call__(self, doc):
        tokens = list(self.sentiment_aware_tokenize(doc))

        if self.negate:
            tokens = nltk.sentiment.util.mark_negation(tokens)

        if self.n == 1:
            return tokens

        skipgrams = list(nltk.skipgrams(tokens, self.n, self.k))
        return list([' '.join(s) for s in skipgrams])
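The two __call__ methods above are analyzers that turn a document into either plain tokens or joined skipgram strings. A minimal sketch of how such a callable could plug into scikit-learn's CountVectorizer; the class name, tokenizer, and parameters below are illustrative assumptions, not the original project's:

from nltk import skipgrams
from sklearn.feature_extraction.text import CountVectorizer

class SkipgramAnalyzer:
    def __init__(self, n=2, k=2):
        self.n = n
        self.k = k

    def __call__(self, doc):
        tokens = doc.lower().split()  # stand-in for sentiment_aware_tokenize
        if self.n == 1:
            return tokens
        return [' '.join(s) for s in skipgrams(tokens, self.n, self.k)]

vectorizer = CountVectorizer(analyzer=SkipgramAnalyzer(n=2, k=2))
X = vectorizer.fit_transform(["not a bad movie at all", "a very bad movie indeed"])
print(sorted(vectorizer.vocabulary_)[:5])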
Example #9
def skip_grams(tokens, n, k):
    skip_gram_value = 0
    # tokens = clean_data(tweet, lemmatize= False)
    a = [x for x in nltk.skipgrams(tokens, n, k)]
    for j in range(len(a)):
        for k in range(n):
            ss = sid.polarity_scores(a[j][k])
            if (ss["pos"] == 1):
                skip_gram_value += 1
            if (ss["neg"] == 1):
                skip_gram_value -= 1
    return skip_gram_value
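The sid object in these sentiment-weighted skipgram functions is never defined in the snippets; it is presumably NLTK's VADER SentimentIntensityAnalyzer. A minimal sketch of how skip_grams above could be driven under that assumption, with toy tokens:

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# A single clearly positive (or negative) word scores "pos" (or "neg") == 1.0,
# which is the condition the loops above key on.
print(sid.polarity_scores("great"))
print(sid.polarity_scores("terrible"))

tokens = "great plot but terrible acting".split()
print(skip_grams(tokens, 2, 1))  # net count of positive minus negative skipgram tokens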
Example #10
def rouge_s(summary: str, ref: str, beta: float = 1) -> float:
    """
    Computes the ROUGE-S score of a summary, ROUGE-2 score with skip-bigrams
    Args:
        summary: A `str` corresponding to a summary we want to evaluate.
        ref: A `str` corresponding to a reference summary we use to evaluate.
        beta: a `float` giving the importance of the precision in comparison to the recall.
    Returns:
        A `float` between 0 and 1 giving the ROUGE-S score.
    """
    vocab_summary = words(summary)
    vocab_ref = words(ref)
    summary_skip2 = set(skipgrams(vocab_summary, 2, len(vocab_summary)))
    ref_skip2 = set(skipgrams(vocab_ref, 2, len(vocab_ref)))
    if not ref_skip2 or not summary_skip2:  # no skip-bigrams on either side
        return 0
    p = len(summary_skip2 & ref_skip2) / len(summary_skip2)
    r = len(summary_skip2 & ref_skip2) / len(ref_skip2)
    if r != 0 or p != 0:
        return ((1 + beta**2) * p * r) / (r + (beta**2 * p))
    else:
        return 0
Example #11
def SkipTrigrams(tokenized_sentence):
	all_skip_bigrams = []
	final_list = []
	
	all_skip_bigrams.append(nltk.skipgrams(tokenized_sentence, 3, 0))

	#print(all_skip_bigrams[0])
	#print(list(all_skip_bigrams[0]))
	
	for tupl in list(all_skip_bigrams[0]):
		final_list.append(tupl[0]+' '+tupl[1]+' '+tupl[2])

	return final_list
Example #12
def SkipBigramsSentence(x):
    all_skip_bigrams = []
    final_list = []
    #print(x)
    #print('xxx')
    all_skip_bigrams.append(list(nltk.skipgrams(x, 2, 5)))
    #print(all_skip_bigrams)
    for tupl in all_skip_bigrams[0]:
        #print('tuple')
        #print(tupl)
        final_list.append(tupl[0] + ' ' + tupl[1])

    return final_list
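For reference, what nltk.skipgrams(x, 2, 5) yields for a short token list, and hence what SkipBigramsSentence above returns (toy input, assuming the function above is in scope):

import nltk

tokens = ['the', 'movie', 'was', 'good']
print(list(nltk.skipgrams(tokens, 2, 5)))
# [('the', 'movie'), ('the', 'was'), ('the', 'good'),
#  ('movie', 'was'), ('movie', 'good'), ('was', 'good')]
print(SkipBigramsSentence(tokens))
# ['the movie', 'the was', 'the good', 'movie was', 'movie good', 'was good']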
Example #13
    def get_sg(seqs, scores, min_n, max_n, skip):
        """
        Return all skipgrams of seqs and scores of length [min_n, max_n] with at most `skip` skipped tokens
        :param seqs: token sequences 2D list or similar
        :param scores: imp score sequences 2D list or similar
        :param min_n: minimum skipgram length
        :param max_n: max skipgram length (inclusive)
        :param skip: max number of tokens to skip
        :return: 2D list of skipgrams of seqs and scores (n_inst * n_sg)
        """
        cur_inst_sg_seqs, cur_inst_sg_scores = list(), list()
        for n in range(min_n, max_n + 1):
            if not n:
                continue
            if n == 1:
                cur_inst_sg_seqs.extend(seqs)
                cur_inst_sg_scores.extend(scores)
                continue

            cur_inst_sg_seqs.extend([' '.join(sg) for sg in skipgrams(seqs, n=n, k=skip)])
            cur_inst_sg_scores.extend([np.mean(sg) for sg in skipgrams(scores, n=n, k=skip)])

        return cur_inst_sg_seqs, cur_inst_sg_scores
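A toy call of get_sg, under the assumption that seqs is one instance's token list, scores its per-token importance values, numpy is imported as np, and skipgrams comes from nltk.util (as the method implies):

import numpy as np
from nltk.util import skipgrams

seqs = ['cheap', 'flights', 'to', 'london']
scores = [0.9, 0.8, 0.1, 0.7]

sg_seqs, sg_scores = get_sg(seqs, scores, min_n=1, max_n=2, skip=1)
print(sg_seqs)    # unigrams first, then skip-bigrams such as 'cheap flights', 'cheap to', ...
print(sg_scores)  # raw scores for unigrams, np.mean of the pair for skip-bigrams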
Example #14
def skip_grams(tokens, n, k):
    skip_gram_value = 0
    # tokens = clean_data('if it is well hidden', lemmatize= False)
    #a=[('if', 'it'), ('it', 'is'), ('is', 'well'), ('well', 'hidden')] for k=0
    #a= [('if', 'it'), ('if', 'is'), ('it', 'is'), ('it', 'well'), ('is', 'well'), ('is', 'hidden'), ('well', 'hidden')] for k=1
    #a = [('if', 'it'), ('if', 'is'), ('if', 'well'), ('it', 'is'), ('it', 'well'), ('it', 'hidden'), ('is', 'well'), ('is', 'hidden'), ('well', 'hidden')] for k=2
    a = [x for x in nltk.skipgrams(tokens, n, k)]
    for j in range(len(a)):
        for k in range(n):
            ss = sid.polarity_scores(a[j][k])
            if (ss["pos"] == 1):
                skip_gram_value += 1
            if (ss["neg"] == 1):
                skip_gram_value -= 1
    return skip_gram_value
Example #15
def skip_grams(tokens, n, dist):
    skip_gram_value = 0
    pos_words = []
    neg_words = []
    arr = [x for x in nltk.skipgrams(tokens, n, dist)]
    for j in range(len(arr)):
        for k in range(n):
            ss = sia.polarity_scores(arr[j][k])
            if (ss["pos"] == 1):
                skip_gram_value += 1
                pos_words.append(arr[j][k])
            if (ss["neg"] == 1):
                skip_gram_value -= 1
                neg_words.append(arr[j][k])
    return pos_words, neg_words
Example #16
def get_n_skipgrams(tokens, n, k):
    """
        Fuction to extract the skip gram of a given list of tokens
        Args:
        -----
        tokens (list of strings) >>> list of tokens extracted by using the TextPreprocessing class.
        n (int) >>> number of gram.
        k (int) >>> skip param for the skip gram.
        
        Returns:
        --------
        a list of tuple that contains the skip gram.
        """

    sent = " ".join(tokens).split()
    return list(nltk.skipgrams(sent, n, k))
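A quick illustration of the output on toy tokens (the TextPreprocessing class mentioned in the docstring is not needed for the call itself):

import nltk

tokens = ['data', 'science', 'is', 'fun']
print(get_n_skipgrams(tokens, n=2, k=1))
# [('data', 'science'), ('data', 'is'), ('science', 'is'),
#  ('science', 'fun'), ('is', 'fun')]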
Example #17
def remove_html(raw_html):
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.lower()
    cleantext = re.sub(' ', ' ', cleantext)
    cleantext = re.sub('•', ' ', cleantext)

    cleantext = re.sub(REPLACE_BY_SPACE_RE, " ", cleantext)
    cleantext = re.sub(BAD_SYMBOLS_RE, "", cleantext)

    cleantext = " ".join([
        word_lemma.lemmatize(w) for w in cleantext.split(" ")
        if w not in STOPWORDS
    ])
    #cleantext = " ".join([w for w in cleantext.split(" ") if w not in STOPWORDS])

    cleantext = cleantext + ' '.join([
        ' '.join(x) for x in
        (list(skipgrams(itertools.islice(cleantext.split(), 50), 3, 1)))
    ])

    return cleantext
Example #18
from nltk import skipgrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import itertools

stopwords = list(stopwords.words('english'))
numbers = [str(n) for n in range(0, 100)]  # as strings, so they can match the word tokens
forumSpecificWords = ['>', '<', '?', '[', ']', '*'] + numbers
stopwords += forumSpecificWords

with open("shakespeare.txt", 'rb') as f:
    content = f.read().decode('utf-8')
    tokens = word_tokenize(content)
    withoutStopwords = [w.strip() for w in tokens if w not in stopwords]

    # Print skipgrams over the first 100 non-stopword tokens for a few (n, k) settings.
    print(
        "\n-----2,2:",
        list(skipgrams(itertools.islice(withoutStopwords, 100), 2, 2)))
    print(
        "\n-----2,3:",
        list(skipgrams(itertools.islice(withoutStopwords, 100), 2, 3)))
    print(
        "\n-----3,2:",
        list(skipgrams(itertools.islice(withoutStopwords, 100), 3, 2)))
Example #19
def main():
    # Start the counter
    start = time.time()

    # Load raw data and tokenize
    print("Loading data...")
    corpus_file = str(sys.argv[1])
    corpus = open(corpus_file, 'r')
    text = corpus.readlines()
    text = list(map(str.strip, text))
    text_string = ' '.join(text)
    print("Tokenizing...")
    tokens = nltk.word_tokenize(text_string)

    # Function to create a dataframe with counts and probabilities
    def create_count_df(list_to_count):
        list_with_counts = collections.Counter(list_to_count)
        df = pd.DataFrame()
        df['word'] = list_with_counts.keys()
        df['count'] = list_with_counts.values()
        df['prob'] = df['count'] / sum(df['count'])
        return df

    # Create the list of unigrams with the count and normalize probability
    print("Creating the list of unigrams...")
    unigram_df = create_count_df(tokens)
    print("Creating the list of skipgrams...")
    skipgram_list = list(skipgrams(tokens, 2, 2))
    skipgram_df = create_count_df(skipgram_list)
    print("# tokens: ", len(tokens))
    print("# unigrams: ", unigram_df.shape[0])
    print("# skipgrams: ", skipgram_df.shape[0])

    # For each pair of words calculate the PMI and create a data frame
    print("Calculating PMI values for each skipgram...")
    skipgram_df[['word1', 'word2']] = skipgram_df['word'].apply(pd.Series)
    unigram_df = unigram_df.set_index('word')
    skipgram_df['prob1'] = skipgram_df['word1'].map(unigram_df['prob'].get)
    skipgram_df['prob2'] = skipgram_df['word2'].map(unigram_df['prob'].get)
    skipgram_df['pmi'] = np.log(skipgram_df['prob'] / (skipgram_df['prob1'] * skipgram_df['prob2']))
    skipgram_df = skipgram_df[['word1', 'word2', 'pmi']]

    # Pivot the data frame into a sparse matrix, and convert NaNs into 0s
    print("Converting into a matrix...")
    pmi_matrix = skipgram_df.pivot(index='word1', columns='word2', values='pmi')
    pmi_matrix = pmi_matrix.fillna(0)

    # Apply SVD to reduce the size of the matrix to get word vectors
    print("Extracting word vectors...")
    U, S, V = scipy.sparse.linalg.svds(pmi_matrix.values, k=int(sys.argv[2]))
    word_list = pmi_matrix.index.values  # rows of U line up with the rows of the PMI matrix

    # Save the model
    print("Saving model...")
    word_list_name = '_'.join([sys.argv[3], 'wordlist.p'])
    vectors_name = '_'.join([sys.argv[3], 'vectors.p'])
    output_word_list = open(word_list_name, 'wb')
    pickle.dump(word_list, output_word_list)
    output_word_list.close()
    output_vectors = open(vectors_name, 'wb')
    pickle.dump(U, output_vectors)
    output_vectors.close()

    # Print out overall statistics of the run
    end = time.time()
    print("Running time: ", str(round(end - start, 1)), "seconds")
    return
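The heart of this script is the pointwise mutual information of each skip-bigram, PMI(w1, w2) = log(P(w1, w2) / (P(w1) * P(w2))), computed on the 'pmi' line above. A tiny standalone check of that arithmetic with made-up probabilities:

import numpy as np

p_w1, p_w2, p_pair = 0.02, 0.03, 0.0012
pmi = np.log(p_pair / (p_w1 * p_w2))  # log(0.0012 / 0.0006) = log 2 ≈ 0.693
print(round(pmi, 3))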
Example #20
File: ngrams.py  Project: rafspiny/CliNER
def get_char_skip_grams(text, n, k):

    text = ' '.join(text)
    chars = [c for c in text]

    return set([''.join(t) for t in skipgrams(chars, n, k)])
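A toy call (CliNER applies this to clinical text; the input here is just illustrative and assumes the function above is in scope alongside nltk's skipgrams import):

from nltk import skipgrams

print(sorted(get_char_skip_grams(['hi', 'there'], n=2, k=1)))
# a set of 2-character skipgrams from "hi there", such as 'hi', 'h ', 'he', 'th', ...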
Example #21
from nltk import skipgrams
import jieba
import csv
import re
import gc

f = open('D:/wiki_texts_data.txt', 'r', encoding='utf-8')

sent = re.sub("[A-Za-z0-9]", "", f.read())

sent_cut = jieba.cut(sent)
sent_list = []

for i in sent_cut:
    sent_list.append(i)

j = 0
c = 2  # skipgram order n: starts at 2 and grows by 2 after each outer pass

for i in range(6):
    for k in range(7):
        with open(str(j) + '.csv', 'w', encoding="utf_8_sig",
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            skipsent = list(skipgrams(sent_list, c, k))
            writer.writerows(zip(skipsent))
            del skipsent
            gc.collect()
        j = j + 1
    c = c + 2
Example #22
    def skip_analyzer(self, doc):
        tokens = super().build_analyzer()(doc)
        if self.n <= 1:
            return nltk.ngrams(tokens, n=self.n)
        return nltk.skipgrams(tokens, self.n, self.k)
Example #23
from os import listdir
from os.path import isfile, join
from collections import Counter

import networkx as nx
import nltk

# data_path and STOP_WORDS are assumed to be defined elsewhere in the original module.
freq_path = "data/processed/freqs"

lyrics_files = [f for f in listdir(data_path) if isfile(join(data_path, f))]
for lyrics_file in lyrics_files:
    with open(join(data_path, lyrics_file), "r") as infile:
        text = infile.read().strip().split()

    wfreqs = Counter(text)
    wfreqs = {
        k: v
        for k, v in sorted(
            wfreqs.items(), key=lambda item: item[1], reverse=True)
        if v > 2 and k not in STOP_WORDS
    }

    skipgrams = Counter(list(nltk.skipgrams(text, 2, 3)))
    skipgrams = {
        k: v
        for k, v in sorted(
            skipgrams.items(), key=lambda item: item[1], reverse=True) if v > 2
    }

    text_graph = nx.Graph()
    for k, v in skipgrams.items():
        text_graph.add_edge(k[0], k[1], weight=v)
    pr = nx.pagerank(text_graph, weight='weight')
    pr = {
        k: v
        for k, v in sorted(pr.items(), key=lambda item: item[1], reverse=True)
        if k not in STOP_WORDS
    }
Example #24
from os import listdir
from os.path import isfile, join

import networkx as nx
from bounter import bounter
from nltk import skipgrams


in_path = "data/seedonly"
txts = [f for f in listdir(in_path) if isfile(join(in_path, f))]

with open("data/external/stopwords.txt", "r") as f:
    stopwords = set(f.read().strip().split("\n"))

counts = bounter(size_mb=1024)
for txt in txts:
    with open(join(in_path, txt), "r") as f:
        text = f.read().split()
    text = [wd for wd in text if wd not in stopwords]
    skips = list(skipgrams(text, 2, 5))
    skips = [sorted(t) for t in skips]
    skips = ['@'.join(t) for t in skips]
    counts.update(skips)

G = nx.Graph()
i = 0
for skip, freq in counts.iteritems():
    if freq > 50:
        try:
            source, target = skip.split("@")
            G.add_edge(source, target, weight=freq)
            if source not in G:
                G.add_node(source)
            if target not in G:
                G.add_node(target)