Example #1
File: ngram.py Project: mciszczon/ztp
    def _iterate_multiplicity(self, ngrams_generator: Iterator) -> list:
        # Requires `from typing import Iterator` at module level.
        # Drain the generator into a list; equivalent to list(ngrams_generator).
        ngrams = []
        while True:
            try:
                ngrams.append(next(ngrams_generator))
            except StopIteration:
                return ngrams
    def find_keywords(self, titles):
        # Requires `import collections, statistics` and
        # `from itertools import chain` at module level.
        # Collect the n-grams of every title.
        ngrams = []
        for t in titles:
            ngrams.append(self.get_ngrams(t))

        keywords = collections.Counter(chain.from_iterable(ngrams))
        # Threshold on the median count of all keywords (an earlier variant
        # used the mean of the counts greater than 2).
        threshold = statistics.median(keywords.values())

        # Keep only the keywords whose count exceeds the threshold.
        new_keys = []
        for k in keywords:
            if keywords[k] > threshold:
                new_keys.append(k)

        ner = {k: keywords[k] for k in new_keys}

        # Drop every NER candidate that is a substring of another candidate.
        cleaned_ners = []
        for k1 in ner:
            subset_of_others = False
            for k2 in ner:
                if k1 != k2 and k1 in k2:
                    subset_of_others = True
                    break
            if not subset_of_others:
                cleaned_ners.append(k1)
        return set(cleaned_ners)
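The heart of find_keywords is a Counter over all n-grams plus a median cut-off; a self-contained sketch of just that step, on made-up n-gram lists:

import collections
import statistics
from itertools import chain

ngrams_per_title = [["deep learning", "learning for", "for nlp"],
                    ["deep learning", "learning for", "for vision"]]
counts = collections.Counter(chain.from_iterable(ngrams_per_title))
threshold = statistics.median(counts.values())
print(sorted(k for k, c in counts.items() if c > threshold))
# ['deep learning', 'learning for'] -- counts of 2 beat the median of 1.5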
Example #3
def get_word_bigrams(words):
    # Pair each word with its predecessor to form bigram strings.
    ngrams = []
    for i in range(1, len(words)):
        ngrams.append("%s %s" % (words[i - 1], words[i]))
    return ngrams
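A quick check of the bigram builder:

print(get_word_bigrams("the quick brown fox".split()))
# ['the quick', 'quick brown', 'brown fox']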
Example #4
def ngrams(words, n):
    # Requires `import collections` at module level.
    # Slide a fixed-size window over the words, emitting each window as a string.
    ngrams = []
    d = collections.deque(maxlen=n)
    d.extend(words[:n])
    for word in words[n:]:
        ngrams.append(' '.join(d))
        d.append(word)
    if len(d) == n:
        # Emit the final window, which the loop above never joins.
        ngrams.append(' '.join(d))
    return ngrams
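With the final window emitted, a quick check:

import collections
print(ngrams(['to', 'be', 'or', 'not', 'to', 'be'], 3))
# ['to be or', 'be or not', 'or not to', 'not to be']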
def combineTokens(a):
    # Join each DataFrame's 'NGram' token tuples into strings, then flatten.
    # `flatten` is an external helper that collapses a list of lists.
    ngrams = []
    for gram_data_frame in a:
        raw_value = gram_data_frame['NGram'].values
        token_values = [' '.join(token) for token in raw_value]
        ngrams.append(token_values)
    return flatten(ngrams)
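A hedged usage sketch: it assumes each element of `a` is a pandas DataFrame with an 'NGram' column holding token tuples, and stands in a chain-based helper for the undefined `flatten`.

from itertools import chain
import pandas as pd

def flatten(lists):
    # Stand-in for the helper the snippet assumes; collapses a list of lists.
    return list(chain.from_iterable(lists))

df1 = pd.DataFrame({'NGram': [('new', 'york'), ('san', 'francisco')]})
df2 = pd.DataFrame({'NGram': [('los', 'angeles')]})
print(combineTokens([df1, df2]))
# ['new york', 'san francisco', 'los angeles']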
def process_text(text, key):
    # Look up the cleaning pipeline for this attribute (key) in the external
    # `col_custom` mapping and apply each function in turn.
    processing_functions = col_custom[key]
    for fun in processing_functions[1:]:
        if len(fun) != 2:
            # Single-argument cleaning function, named by a string.
            text = eval(f"{fun}(text)")
        else:
            # Two-argument function, per 'data_preprocessing_functions.csv'.
            text = eval(f"{fun[0]}(text, fun[1])")
    ngrams = []
    ngram_len = []
    for stri in text[0]:
        ngram_len.append(len(stri))
        ngrams.append(" ".join(sorted(stri)))
    return ngrams, text[1], ngram_len
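process_text depends on a module-level col_custom mapping loaded from 'data_preprocessing_functions.csv'; a minimal self-contained sketch of the same eval-based dispatch, with hypothetical cleaning functions:

def lowercase(text):
    return text.lower()

def strip_chars(text, chars):
    return text.strip(chars)

# Hypothetical pipeline: first entry is the attribute name, the rest name
# cleaning functions (two-argument ones are (name, extra_arg) pairs).
col_custom = {'title': ['title', 'lowercase', ('strip_chars', '!')]}

text = "HELLO!"
for fun in col_custom['title'][1:]:
    if len(fun) != 2:
        text = eval(f"{fun}(text)")
    else:
        text = eval(f"{fun[0]}(text, fun[1])")
print(text)  # 'hello'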
Example #7
def add_ngram(q, n_gram_max):
    # Append '--'-joined n-grams (n = 2..n_gram_max) to the original token list.
    ngrams = []
    for n in range(2, n_gram_max + 1):
        for w_index in range(len(q) - n + 1):
            ngrams.append('--'.join(q[w_index:w_index + n]))
    return q + ngrams
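A quick check of add_ngram:

print(add_ngram(['new', 'york', 'city'], 3))
# ['new', 'york', 'city', 'new--york', 'york--city', 'new--york--city']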
def to_ngrams(words, n):
    # Build the character n-grams of each word; the inner loop slices the
    # current word `a`, not the word list.
    ngrams = []
    for a in words:
        for b in range(0, len(a) - n + 1):
            ngrams.append(tuple(a[b:b + n]))
    return ngrams
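A quick check, assuming character n-grams within each word were intended:

print(to_ngrams(['cat', 'dogs'], 2))
# [('c', 'a'), ('a', 't'), ('d', 'o'), ('o', 'g'), ('g', 's')]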