def _iterate_multiplicity(self, ngrams_generator: iter) -> list(): ngrams = list() while True: try: ngrams.append(next(ngrams_generator)) except StopIteration: return ngrams
def find_keywords(self, titles):
    """Extract the n-grams that occur more often than the median count.

    Each title is converted to n-grams via ``self.get_ngrams``; the counts
    of all n-grams are pooled and every n-gram whose count is strictly
    greater than the median count is kept.

    Cleanup performed here: the variable previously named
    ``mean_for_gt_2`` actually held the *median of all counts* (the mean
    computation was commented out), and the final "filter n-grams that
    are substrings of others" loop was entirely commented out and simply
    copied every key — both dead paths removed.

    Args:
        titles: iterable of title strings.

    Returns:
        set of keyword n-grams.

    Raises:
        statistics.StatisticsError: if *titles* yields no n-grams at all
            (same behavior as the original).
    """
    ngrams_per_title = [self.get_ngrams(title) for title in titles]
    counts = collections.Counter(chain.from_iterable(ngrams_per_title))
    # Threshold is the median of all n-gram occurrence counts.
    median_count = statistics.median(counts.values())
    # Keep only n-grams strictly above the median count.
    return {ngram for ngram, count in counts.items() if count > median_count}
def get_word_bigrams(words):
    """Return the adjacent word pairs of *words* as space-joined strings.

    Idiomatic rewrite of the index-based loop (``range(len(words))`` with
    an ``if i > 0`` guard): zipping the sequence with itself shifted by
    one yields exactly the same pairs with no index bookkeeping.

    Args:
        words: sequence of word strings.

    Returns:
        list of "prev cur" bigram strings; empty for fewer than two words.
    """
    return [f"{prev} {cur}" for prev, cur in zip(words, words[1:])]
def ngrams(words, n):
    """Return every contiguous n-gram of *words* as a space-joined string.

    Bug fix: the original deque/``itertools.cycle`` sliding window joined
    each window *before* appending the next word, so the final window
    (the last ``n`` words) was never emitted — e.g. 4 words with ``n=2``
    produced only 2 of the 3 bigrams, and an input of exactly ``n`` words
    produced nothing. A direct slice over all valid start indices emits
    every window.

    Args:
        words: sequence of word strings.
        n: n-gram size (number of words per gram).

    Returns:
        list of n-gram strings; empty when ``len(words) < n``.
    """
    # range() is empty when len(words) < n, matching the original's
    # empty-result behavior for short inputs.
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
def combineTokens(a): ngrams = [] for gram_data_frame in a: # print(gram_data_frame['NGram'].values) raw_value = gram_data_frame['NGram'].values token_values = [' '.join(token) for token in raw_value] ngrams.append(token_values) # print('1') return flatten(ngrams)
def process_text(text, key):
    """Run the cleaning pipeline configured for *key* over *text*, then
    build sorted-character strings from the cleaned result.

    Relies on a module-level ``col_custom`` mapping (defined elsewhere in
    the file): ``col_custom[key]`` is presumably a list whose element 0
    is skipped (likely the attribute name itself) and whose remaining
    entries name cleaning functions — TODO confirm against
    'data_preprocessing_functions.csv'.

    After the pipeline, ``text`` is indexed as ``text[0]`` (an iterable
    of strings) and ``text[1]`` — presumably (tokens, extra payload);
    verify against the cleaning functions' return shape.

    Returns:
        tuple: (list of space-joined sorted-character strings,
                ``text[1]`` passed through unchanged,
                list of the corresponding input-string lengths).
    """
    processing_functions = col_custom[key]
    for fun in processing_functions[
        1:]:  # element 0 is skipped; iterate the cleaning functions configured for this attribute (key)
        # NOTE(review): len(fun) != 2 distinguishes a bare function name
        # from a [name, arg] pair — a 2-character function *name* would
        # be misclassified as a pair; confirm no such names exist.
        if len(fun) != 2:
            # SECURITY: eval() executes arbitrary code; this is safe only
            # while col_custom comes from a trusted configuration source.
            command = f"{fun}(text)"
            text = eval(command)  # apply the cleaning function to the text
        else:  # functions taking 2 arguments per 'data_preprocessing_functions.csv'
            # Only {fun[0]} is interpolated; the literal text "fun[1]" in
            # the command string is resolved by eval() from local scope.
            command = f"{fun[0]}(text,fun[1])"
            text = eval(command)
    ngrams = []
    ngram_len = []
    for stri in text[0]:
        ngram_len.append(len(stri))
        # Characters are sorted before joining — presumably to give
        # anagrams an identical key; confirm intent with the callers.
        ngrams.append(" ".join(sorted(stri)))
    return ngrams, text[1], ngram_len
def add_ngram(q, n_gram_max): ngrams = [] for n in range(2, n_gram_max+1): for w_index in range(len(q)-n+1): ngrams.append('--'.join(q[w_index:w_index+n])) return q + ngrams
def to_ngrams(words, n):
    """Return every contiguous n-gram of *words* as a tuple.

    Bug fix: the original looped over each word ``a`` and used
    ``len(a)`` — the *character count of the word* — as the window bound
    while slicing ``words``, producing duplicated and truncated tuples
    (including empty ones). The window must slide over the word list
    itself, once.

    Args:
        words: sequence of word strings.
        n: n-gram size (number of words per tuple).

    Returns:
        list of n-tuples of words; empty when ``len(words) < n``.
    """
    # range() is empty when len(words) < n, so short inputs yield [].
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]