def get_wikifrequencies(candidate_keywords):
    """
    Return normalized word frequency for each keyword in Wikipedia
    """
    max_frequency = wikiwords.freq('the')
    return [
        wikiwords.freq(w) / float(max_frequency) for w in candidate_keywords
    ]
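A minimal usage sketch (assuming the wikiwords package is installed and the function above is in scope; the test words are arbitrary). Frequencies are normalized by freq('the'), so values lie in [0, 1] for any word no more frequent than 'the':

import wikiwords  # required by get_wikifrequencies above

print(get_wikifrequencies(['computer', 'science', 'zeitgeist']))
# common words score near 1.0, rare words near 0.0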
Example #2
def compute_features(d_dict, q_dict, c_dict):
    # in_q, in_c, lemma_in_q, lemma_in_c, tf
    q_words_set = set([w.lower() for w in q_dict['words']])
    in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in d_dict['words']]
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in d_dict['words']]

    q_words_set = set([w.lower() for w in q_dict['lemma']])
    lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in d_dict['lemma']]
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in d_dict['lemma']]

    tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in d_dict['words']]
    tf = [float('%.2f' % v) for v in tf]
    d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), d_dict['words']))
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation
    }
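The tf feature above is a damped log of the raw Wikipedia count. As a standalone sketch (assuming wikiwords is installed; damped_tf is a hypothetical helper, not part of the original code), note that N * freq(w) reconstructs the occurrence count, and the +10 smoothing gives unseen words a floor of 0.1 * ln(10) ≈ 0.23:

import math
import wikiwords

def damped_tf(w):
    # N * freq reconstructs the raw occurrence count, since freq = occ / N
    return round(0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10), 2)

print(damped_tf('the'), damped_tf('xyzzyqq'))  # well above vs. at the ~0.23 floor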
Example #3
def wiki_frequency(text):
    """Map each distinct whitespace-separated word in text to its Wikipedia frequency."""
    frequencies = {}
    for word in text.split():
        if word not in frequencies:
            frequencies[word] = wikiwords.freq(word)
    return frequencies
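A quick usage sketch (hypothetical input; assumes wikiwords is installed). Duplicate words in the text are looked up only once:

freqs = wiki_frequency('the quick brown fox jumps over the lazy dog')
print(freqs['the'], freqs['fox'])  # one entry per distinct word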
Example #4
def get_term_frequency(word: Union[str, Token]) -> float:
    """
    Returns the Term Frequency of word in the Wikipedia corpus. Calculated
    as:

        tf_w = log(1 + f_w)

    Where `f_w` is the number of occurrences of the word in the corpus.
    """
    if isinstance(word, Token):
        word = word.text
    # I'd like to use wikiwords.occ instead of this, but it's broken,
    # so I reconstruct the occurrence count as N * freq (since freq = occ / N).
    occurrences = wikiwords.N * wikiwords.freq(word)
    return math.log(1 + occurrences)
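A quick sanity check of the formula (a sketch, assuming wikiwords is installed and the function above with its imports is in scope; 'qqzzxx' is an arbitrary nonsense word). The term frequency grows logarithmically with corpus occurrences and is 0.0 for unseen words:

for w in ('the', 'wikipedia', 'qqzzxx'):
    print(w, get_term_frequency(w))  # tf_w = log(1 + f_w); 0.0 for unseen words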
Example #5
def get_tfidf(sentence):
    """
    Calculate the weight of each word in the sentence using a pretrained tf-idf.
    sentence - a list of strings
    returns a list of scalars
    """
    tfidf_ = []
    for w in sentence:
        try:
            tfidf = 0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) +
                                   10)
            tfidf = float('%.2f' % tfidf)
        except Exception:
            logger.warning('{} - Failed to get tfidf'.format(w.lower()))
            tfidf = 0.0
        tfidf_.append(tfidf)
    return tfidf_
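A hypothetical call (assumes wikiwords is installed and a logger is configured). Because of the +10 inside the log, words missing from Wikipedia still receive a floor weight of 0.1 * ln(10) ≈ 0.23 rather than a log-of-zero error:

weights = get_tfidf(['An', 'example', 'sentence'])
print(weights)  # two-decimal floats; unseen tokens bottom out near 0.23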
Example #6
def compute_features(q_dict, c_dict):
    # in_c, lemma_in_c, tf
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in q_dict['words']
    ]

    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in q_dict['lemma']
    ]

    # tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in q_dict['words']]
    tf = [wikiwords.freq(w.lower()) for w in q_dict['words']]
    # tf = [float('%.2f' % v) for v in tf]

    q_words = Counter(
        filter(lambda w: not is_stopword(w) and not is_punc(w),
               q_dict['words']))
    from conceptnet import concept_net
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])
    assert len(lemma_in_c) == len(in_c) and len(tf) == len(in_c)
    assert len(tf) == len(q_c_relation)

    q_is_science_term = [is_science_term(w) for w in q_dict['words']]
    q_is_cand = [
        1 if not is_punc(w) and not is_stopword(w) else 0
        for w in q_dict['words']
    ]

    return {
        'in_c': in_c,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'q_c_relation': q_c_relation,
        'q_is_science_term': q_is_science_term,
        'q_is_cand': q_is_cand
    }
Example #7
def get_likelihood_of_string(string, avg_frequency=False):
    """
    Estimate the likelihood of a string of space-separated words by summing
    the Wikipedia frequency of each word, optionally averaging over the
    number of words.

    In:
        string (str): string of words separated by spaces
        avg_frequency (bool): whether to average the summed frequency over the words
    Out:
        sum_frequency (float): likelihood of the string
    """
    sum_frequency = 0
    list_of_words = string.split(" ")

    for word in list_of_words:
        sum_frequency += wikiwords.freq(word.lower())
    if avg_frequency:
        sum_frequency = sum_frequency / len(list_of_words)

    return sum_frequency
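A sketch of the intended behavior (assuming wikiwords is installed; the inputs are arbitrary). A string of common English words scores far higher than a garbled one:

print(get_likelihood_of_string('new york city'))                  # relatively large
print(get_likelihood_of_string('xqjz vwqk', avg_frequency=True))  # near 0.0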
Example #8
def compute_features(p_dict, q_dict, c_dict):
    # p_in_q, p_in_c, lemma_p_in_q, lemma_p_in_c, tf

    p_words_set = set([w.lower() for w in p_dict['words']])
    q_words_set = set([w.lower() for w in q_dict['words']])
    c_words_set = set([w.lower() for w in c_dict['words']])

    p_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['words']]
    p_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['words']]

    q_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['words']]
    q_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['words']]

    c_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['words']]
    c_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['words']]


    p_words_set = set([w.lower() for w in p_dict['lemma']])
    q_words_set = set([w.lower() for w in q_dict['lemma']])
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    p_lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['lemma']]
    p_lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['lemma']]
 
    q_lemma_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['lemma']]
    q_lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['lemma']]

    c_lemma_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['lemma']]
    c_lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['lemma']]

    p_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in p_dict['words']]
    p_tf = [float('%.2f' % v) for v in p_tf]
    q_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in q_dict['words']]
    q_tf = [float('%.2f' % v) for v in q_tf]
    c_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in c_dict['words']]
    c_tf = [float('%.2f' % v) for v in c_tf]
    d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), p_dict['words']))
    
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(p_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(p_dict['words'], c_dict['words'])

    q_p_relation = concept_net.p_q_relation(q_dict['words'], p_dict['words'])
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])

    c_p_relation = concept_net.p_q_relation(c_dict['words'], p_dict['words'])
    c_q_relation = concept_net.p_q_relation(c_dict['words'], q_dict['words'])

    assert len(p_tf) == len(p_q_relation) and len(p_tf) == len(p_c_relation)
    assert len(q_tf) == len(q_p_relation) and len(q_tf) == len(q_c_relation)
    assert len(c_tf) == len(c_p_relation) and len(c_tf) == len(c_q_relation)

    return {
        'p_in_q': p_in_q,
        'p_in_c': p_in_c,
        'p_lemma_in_q': p_lemma_in_q,
        'p_lemma_in_c': p_lemma_in_c,
        'p_tf': p_tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,

        'q_in_p': q_in_p,
        'q_in_c': q_in_c,
        'q_lemma_in_p': q_lemma_in_p,
        'q_lemma_in_c': q_lemma_in_c,
        'q_tf': q_tf,
        'q_p_relation': q_p_relation,
        'q_c_relation': q_c_relation,

        'c_in_p': c_in_p,
        'c_in_q': c_in_q,
        'c_lemma_in_p': c_lemma_in_p,
        'c_lemma_in_q': c_lemma_in_q,
        'c_tf': c_tf,
        'c_p_relation': c_p_relation,
        'c_q_relation': c_q_relation,
    }
Example #9
    def screen_show(self, num, answer):
        if self.mode == "1":
            os.system("cls")
            print("%s\n\n" % self.vacab["Word"][num])
            print("1.%s\n" % self.vacab["Chinese"][answer["1"]])
            print("2.%s\n" % self.vacab["Chinese"][answer["2"]])
            print("3.%s\n" % self.vacab["Chinese"][answer["3"]])
            print("4.%s\n" % self.vacab["Chinese"][answer["4"]])

        if self.mode == "2":
            os.system("cls")
            print("Question No.%s\n\n" % (self._counter - self._start))
            print("%s\n\n" % self.vacab["Word"][num])
            print("%s\n\n" % self.vacab["Chinese"][answer])
            print("1.Easy 2.Hard 3.Hell q.Quit\n")
            if self._counter >= self._end:
                print("You can finish the study now!\n")

        if self.mode == "3":
            os.system("cls")
            print("Question No.%s\n\n" % (self._counter - self._start))
            print("%s\n\n" % self.vacab["Word"][num])
            try:
                statistic = (wikiwords.freq(self.vacab["Word"][num]),
                             wikiwords.occ(self.vacab["Word"][num]))
            except Exception:
                statistic = (0, 0)
            print("Freq:%-10.2eOcc:%-10.2e\n" % statistic)
            #print("%s\n\n" % self.vacab["Chinese"][answer])
            print("1.Easy 2.Hard 3.Hell q.Quit\n")
            if self._counter >= self._end:
                print("You can finish the study now!\n")

        if self.mode == "4":
            os.system("cls")
            print("Question No.%s\n\n" % (self._counter - self._start))
            print("%s\n\n" % self.vacab["Word"][num])
            try:
                statistic = (wikiwords.freq(self.vacab["Word"][num]),
                             wikiwords.occ(self.vacab["Word"][num]))
            except Exception:
                statistic = (0, 0)
            print("Freq:%-10.2eOcc:%-10.2e\n" % statistic)
            #print("%s\n\n" % self.vacab["Chinese"][answer])
            print("1.Easy 2.Hard 3.Hell q.Quit\n")
Example #10
def compute_features(d_dicts, q_dict, c_dicts, q_terms):
    # compute features for each d_dict and c_dict
    in_qs, in_cs, lemma_in_qs, lemma_in_cs = [], [], [], []
    p_q_relations, p_c_relations = [], []
    tfs = []

    for d_dict, c_dict in zip(d_dicts, c_dicts):
        # in_q, in_c, lemma_in_q, lemma_in_c, tf
        q_words_set = set([w.lower() for w in q_dict['words']])
        in_q = [
            int(w.lower() in q_words_set and not is_stopword(w)
                and not is_punc(w)) for w in d_dict['words']
        ]
        in_qs.append(in_q)
        q_words_set = set([w.lower() for w in q_dict['lemma']])
        lemma_in_q = [
            int(w.lower() in q_words_set and not is_stopword(w)
                and not is_punc(w)) for w in d_dict['lemma']
        ]
        lemma_in_qs.append(lemma_in_q)

        c_words_set = set([w.lower() for w in c_dict['words']])
        in_c = [
            int(w.lower() in c_words_set and not is_stopword(w)
                and not is_punc(w)) for w in d_dict['words']
        ]
        in_cs.append(in_c)
        c_words_set = set([w.lower() for w in c_dict['lemma']])
        lemma_in_c = [
            int(w.lower() in c_words_set and not is_stopword(w)
                and not is_punc(w)) for w in d_dict['lemma']
        ]
        lemma_in_cs.append(lemma_in_c)

        tf = [
            0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
            for w in d_dict['words']
        ]
        tf = [float('%.2f' % v) for v in tf]
        tfs.append(tf)
        #d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), d_dict['words']))

        from conceptnet import concept_net
        p_q_relation = concept_net.p_q_relation(d_dict['words'],
                                                q_dict['words'])
        p_q_relations.append(p_q_relation)
        p_c_relation = concept_net.p_q_relation(d_dict['words'],
                                                c_dict['words'])
        p_c_relations.append(p_c_relation)

        assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and len(
            lemma_in_c) == len(in_q) and len(tf) == len(in_q)
        assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

    if q_terms is not None:
        q_es = [w in q_terms for w in q_dict['words']]
    else:
        q_es = None

    # update in_c, lemma_in_c and p_c_relation
    return {
        'in_qs': in_qs,
        'in_cs': in_cs,
        'lemma_in_qs': lemma_in_qs,
        'lemma_in_cs': lemma_in_cs,
        'tfs': tfs,
        'p_q_relations': p_q_relations,
        'p_c_relations': p_c_relations,
        'q_es': q_es
    }
Example #11
def idf_wiki(token):
    """Computed IDF score based on lookup table of frequency based on Wikipedia corpus"""
    if wikiwords.freq(token) == 0:
        return math.log(wikiwords.N)
    else:
        return math.log(wikiwords.freq(token))
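A quick check of both branches (a sketch, assuming wikiwords is installed; 'qqzzxx' is an arbitrary unseen token). Note that wikiwords.freq returns a probability below 1, so the else branch yields a negative value, while unseen tokens get the large positive log(N):

import math
import wikiwords  # required by idf_wiki above

print(idf_wiki('qqzzxx'))  # unseen: math.log(wikiwords.N), large and positive
print(idf_wiki('the'))     # seen: log of a probability < 1, negative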
Example #12
    def add_post(self, text, metainfo):
        """
        Extract the frequency of words in the text and create a Post.
        TODO: extract related words
        Args:
            text (str): main text of the post
            metainfo (dictionary): title, number of likes and shares, etc.
        """
        text = text.encode('ascii', 'ignore').decode('ascii')

        # sentiment analysis
        t = TextBlob(text)
        metainfo["polarity"] = t.sentiment.polarity
        metainfo["subjectivity"] = t.sentiment.subjectivity

        vader = vaderSentiment(text)
        metainfo["vader"] = vader

        text = text.translate(str.maketrans('', '', string.punctuation))

        # removing stop words
        stop = stopwords.words('english')

        # frequency in the English language
        english_freq = {}
        reverse_stem = {}
        list_words = []
        for i in text.split():
            i = i.lower()
            if i not in stop:
                freq = wikiwords.freq(i, lambda x: 0.000001)
                st = self.stemmer.stem(i)
                if st not in reverse_stem:
                    reverse_stem[st] = i
                english_freq[st] = english_freq.get(st, 0.0) + freq
                list_words.append(st)

        #get frequencies of words
        frequencies = FreqDist(list_words)
        main_words = []
        for word, count in frequencies.items():
            english_freq[word] /= count
            #print reverse_stem[word] + ": " + str(english_freq[word])
            tf_idf = (-1.0)*log(count+0.1)/(log(english_freq[word]))
            main_words.append([tf_idf, count, reverse_stem[word], word])

        #select just most important words
        main_words.sort(reverse=True)
        NUM_MAX = 100
        if len(main_words) > NUM_MAX:
            main_words = main_words[0:NUM_MAX]

        #create dict
        final_main_words = {}
        for w in main_words:
            final_main_words[w[3]] = w[0:3]

        self.last_id_added += 1
        post_id = self.last_id_added

        post = Post(post_id, self.name, metainfo, frequencies, final_main_words, reverse_stem)
        post.save()
        self.update_index(list_words, frequencies)

        self.save()

        return post
Example #13
File: segment.py  Project: iitis/dnsclass
def Pw(word):
    return wikiwords.freq(word, avoid_long_words)
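avoid_long_words is not defined in this snippet; it is the fallback callable that wikiwords.freq invokes for out-of-vocabulary words (compare the lambda fallback in Example #12). A hypothetical definition in the style of Norvig's word-segmentation estimator, which penalizes long unknown words (an assumption, not the project's actual code):

import wikiwords

def avoid_long_words(word):
    # Hypothetical fallback: long unknown words get exponentially smaller
    # probability, so a segmenter prefers splitting them into known words.
    return 10.0 / (wikiwords.N * 10 ** len(word))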
Example #14
File: utils.py  Project: StarWang/nn4nlp
import json
import string
import wikiwords
import unicodedata
import numpy as np

from collections import Counter
from nltk.corpus import stopwords

words = frozenset(stopwords.words('english'))
punc = frozenset(string.punctuation)
def is_stopword(w):
    return w.lower() in words

def is_punc(c):
    return c in punc

baseline = wikiwords.freq('the')
def get_idf(w):
    return np.log(baseline / (wikiwords.freq(w.lower()) + 1e-10))

def load_data(path, scriptKnowledge, use_script_knowledge, use_char_emb):
    from doc import Example
    data = []
    for line in open(path, 'r', encoding='utf-8'):
        if path.find('race') < 0 or np.random.random() < 0.6:
            data.append(Example(json.loads(line), scriptKnowledge, use_script_knowledge, use_char_emb))
    print('Load %d examples from %s...' % (len(data), path))
    return data

class Dictionary(object):
    NULL = '<NULL>'
    UNK = '<UNK>'
Example #15
def get_wikifrequencies(candidate_keywords):
  """
  Return normalized word frequency for each keyword in Wikipedia
  """
  max_frequency = wikiwords.freq('the')
  return [wikiwords.freq(w)/float(max_frequency) for w in candidate_keywords]
Example #16
File: utils.py  Project: StarWang/nn4nlp
def get_idf(w):
    return np.log(baseline / (wikiwords.freq(w.lower()) + 1e-10))
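Since the baseline is freq('the'), the most frequent English word, get_idf measures rarity relative to 'the'; a sketch (assuming wikiwords and numpy are installed and the definitions from Example #14 are in scope):

print(get_idf('the'))          # ~0.0: log(baseline / baseline)
print(get_idf('serendipity'))  # large and positive for rare words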
Example #17
def compute_features(d_dict, q_dict, c_dict, d_id, q_id, c_id, graphs,
                     sentence_graphs):
    # in_q, in_c, lemma_in_q, lemma_in_c, tf
    q_words_set = set([w.lower() for w in q_dict['words']])
    in_q = [
        int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['words']
    ]
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['words']
    ]

    q_words_set = set([w.lower() for w in q_dict['lemma']])
    lemma_in_q = [
        int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['lemma']
    ]
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['lemma']
    ]

    tf = [
        0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
        for w in d_dict['words']
    ]
    tf = [float('%.2f' % v) for v in tf]
    d_words = Counter(
        filter(lambda w: not is_stopword(w) and not is_punc(w),
               d_dict['words']))
    four_lang_utils = Utils()
    p_q_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, q_dict)
    p_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    d_dict, c_dict)
    q_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils,
                                                    q_dict, c_dict)
    p_q_four_lang_sentence_relation =\
        compute_4lang_sentence_relation(sentence_graphs[d_id],
                                        sentence_graphs[d_id]["questions"][q_id], four_lang_utils)
    p_c_four_lang_sentence_relation =\
        compute_4lang_sentence_relation(sentence_graphs[d_id],
                                        sentence_graphs[d_id]["questions"][q_id]["choice"][c_id],
                                        four_lang_utils)
    q_c_four_lang_sentence_relation =\
        compute_4lang_sentence_relation(sentence_graphs[d_id]["questions"][q_id],
                                        sentence_graphs[d_id]["questions"][q_id]["choice"][c_id],
                                        four_lang_utils)
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and len(
        lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'p_q_four_lang_relation': p_q_four_lang_relation,
        'p_c_four_lang_relation': p_c_four_lang_relation,
        'q_c_four_lang_relation': q_c_four_lang_relation,
        'p_q_four_lang_sentence_relation': p_q_four_lang_sentence_relation,
        'p_c_four_lang_sentence_relation': p_c_four_lang_sentence_relation,
        'q_c_four_lang_sentence_relation': q_c_four_lang_sentence_relation
    }