def generateNGrams(sequence, n):
    ngrams = []
    for i in range(len(sequence) - n + 1):
        ngrams.append(' '.join(tuple(sequence[i:i + n])))
    # print(str(n) + "-Grams Generated\n")
    # print(ngrams[:5])
    return ngrams
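# Hedged usage sketch (assumption, not from the source): the sequence is taken to be
# a pre-tokenized list of words, and each n-gram is returned as a space-joined string.
example_tokens = "the quick brown fox jumps".split()
print(generateNGrams(example_tokens, 2))
# -> ['the quick', 'quick brown', 'brown fox', 'fox jumps']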
import string
from collections import Counter

def create_ngram(n, texts, test=False):
    '''
    input: list of word-tokenized sentences
    '''
    # Clean text and add pseudo start and end tokens
    lst_text = []
    for s in texts:
        s = [''.join(c for c in w if c not in string.punctuation) for w in s]  # Remove punctuation
        s = [w.lower() for w in s if w]  # Remove the empty strings
        s = ['<s>'] * (n - 1) + s + ['</s>']
        lst_text.extend(s)
    # For training text, replace rare words with an unknown-word token
    if not test:
        counts = Counter(lst_text)
        low_counter = {w for w, c in counts.items() if c < 2}
        for i, w in enumerate(lst_text):
            if w in low_counter:
                lst_text[i] = '<UNK>'
    # Build ngrams
    ngrams = []
    for i in range(len(lst_text) - n + 1):
        ngrams.append(tuple(lst_text[i:i + n]))
    return ngrams
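# Hedged usage sketch (assumption, not from the source): texts is a list of
# word-tokenized sentences; words seen fewer than twice become '<UNK>'.
sample_texts = [['The', 'cat', 'sat', '.'], ['The', 'cat', 'ran', '.']]
print(create_ngram(3, sample_texts)[:3])
# -> [('<s>', '<s>', 'the'), ('<s>', 'the', 'cat'), ('the', 'cat', '<UNK>')]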
def ngramify(clean, n):
    # collect each length-n window of the cleaned token list as a tuple
    ngrams = []
    for i in range(len(clean) - n + 1):
        ngrams.append(tuple(clean[i:i + n]))
    return ngrams
def get3Grams(payload_obj):
    '''Divides a string into 3-grams
    Example: input - payload: "<script>"
    output - ["<sc","scr","cri","rip","ipt","pt>"]
    '''
    payload = str(payload_obj)
    ngrams = []
    # + 1 so the final 3-gram ("pt>" in the example) is included
    for i in range(len(payload) - 3 + 1):
        ngrams.append(payload[i:i + 3])
    return ngrams
def get1Grams(payload_obj):
    '''Divides a string into 1-grams
    Example: input - payload: "<script>"
    output - ["<","s","c","r","i","p","t",">"]
    '''
    payload = str(payload_obj)
    ngrams = []
    # iterate over every character so the final one (">" in the example) is included
    for i in range(len(payload)):
        ngrams.append(payload[i:i + 1])
    return ngrams
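# Hedged usage sketch: reproduces the docstring examples, with the off-by-one
# fixes applied so the final grams ("pt>" and ">") are included.
print(get3Grams("<script>"))  # ['<sc', 'scr', 'cri', 'rip', 'ipt', 'pt>']
print(get1Grams("<script>"))  # ['<', 's', 'c', 'r', 'i', 'p', 't', '>']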
import pandas as pd

def create_tfidf_vectors(dataframe):
    # compute_ngrams, compute_tfidf, num_speeches and doc_freq are assumed to be
    # defined elsewhere in this module.
    speeches = dataframe['concat_speeches'].tolist()
    # bigrams for every concatenated speech
    ngrams = []
    for unit in speeches:
        ngrams.append(compute_ngrams(unit, 2))
    ngrams_to_add = pd.Series(ngrams)
    dataframe['ngrams'] = ngrams_to_add.values
    # tf-idf vector for every speech's bigrams
    tfidf = []
    for element in ngrams:
        tfidf.append(compute_tfidf(element, num_speeches, doc_freq))
    tfidf_to_add = pd.Series(tfidf)
    dataframe['tfidf'] = tfidf_to_add.values
    return dataframe
import re

def generate_ngrams(s, n):
    # convert to lower case
    s = s.lower()
    # replace all non-alphanumeric characters with spaces;
    # it is important to include the specific letters of the Serbian alphabet such as ž, š and the like
    s = re.sub(r'[^a-zA-Z0-9ćčžšđ\s]', ' ', s)
    # split the sentence into tokens, dropping empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    # walk over the token list to generate the n-grams,
    # joining the split tokens back into a single string
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(" ".join(tokens[i:i + n]))
    return ngrams
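# Hedged usage sketch (not from the source): punctuation is replaced by spaces
# before tokenizing, so the bigrams come out as space-joined strings.
print(generate_ngrams("Dobar dan, kako ste?", 2))
# -> ['dobar dan', 'dan kako', 'kako ste']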
import nltk

def calc_n_grams(data_line, n):
    ngrams = []
    # create pos-tagged tokens
    wrds = nltk.pos_tag(nltk.word_tokenize(data_line))
    # punctuation tags we want to delete
    del_tk = {'LS', 'SYM', 'SENT', '#', '$', '"', '``', '{', '}', '(', ')', ',', ';', '.', ':', "''"}
    # go through and collect the indexes of punctuation tokens
    pop_ind = []  # collected in reverse order so deletion does not shift later indexes
    for i in range(len(wrds)):
        if wrds[i][1] in del_tk:
            pop_ind.insert(0, i)
    # pop the undesired punctuation off the list
    for i in pop_ind:
        del wrds[i]
    if n <= len(wrds):
        # add start and end tokens
        wrds.insert(0, ("START", "SENT_TOKEN"))
        wrds.append(("START", "SENT_TOKEN"))
        # build each n-gram as a flat tuple of alternating word/POS pairs
        for i in range(len(wrds) - n + 1):
            ng = ()
            for j in range(n):
                ng = ng + tuple(wrds[i + j])
            ngrams.append(ng)
    return ngrams
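# Hedged usage sketch (assumption): the NLTK tokenizer and tagger resources must
# already be downloaded, e.g.
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
print(calc_n_grams("The cat sat on the mat.", 2))
# Each bigram is a flat (word, tag, word, tag) tuple, e.g.
# ('START', 'SENT_TOKEN', 'The', 'DT'), ('The', 'DT', 'cat', 'NN'), ...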
import pandas as pd
import nltk
from itertools import chain
from scipy.stats import binom_test  # newer SciPy releases expose scipy.stats.binomtest instead

def freq_pval_finder(X, vectorizer, gram, alternative):
    # Note: relies on the globals Y_transpose (label column, indexed by document)
    # and corpus (raw documents) being defined in the enclosing scope.
    X_df = pd.DataFrame(X)
    # for every vocabulary column, record which documents contain the term (1)
    # and which do not (0)
    zeros = []
    ones = []
    for i in X_df:
        zero_filter = []
        ones_filter = []
        counter = 0
        while counter < X_df[i].shape[0]:
            if X_df[i][counter] == 1:
                ones_filter.append(counter)
            if X_df[i][counter] == 0:
                zero_filter.append(counter)
            counter += 1
        zeros.append(zero_filter)
        ones.append(ones_filter)
    # collect the labels of the documents where each word is present / absent
    success = []  # word is present
    failure = []  # word is absent
    for i, j in zip(ones, zeros):
        each_success = []
        each_failure = []
        for k in i:
            each_success.append(Y_transpose[0][k])
        for l in j:
            each_failure.append(Y_transpose[0][l])
        success.append(each_success)
        failure.append(each_failure)
    success_prop = []
    failure_prop = []
    for i, j in zip(success, failure):
        success_prop.append(sum(i) / len(i))  # proportion of spams when the word is present
        failure_prop.append(sum(j) / len(j))  # proportion of spams when the word is not present
    # binomial test: does the spam rate among documents containing the word differ
    # from the spam rate among documents that do not contain it?
    P_VAL = []
    for i, j in zip(success, failure_prop):
        p_val = binom_test(sum(i), len(i), j, alternative=alternative)
        P_VAL.append(p_val)
    word_keys = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])

    def word_to_ngrams(words, n):
        return [' '.join(words[i: i + n]) for i in range(len(words) - n + 1)]

    # frequency distribution of the n-grams over the whole corpus
    tokens = [nltk.word_tokenize(i) for i in corpus]
    ngrams = []
    for i in tokens:
        ngrams.append(word_to_ngrams(i, gram))
    word_to_grams = list(chain(*ngrams))
    fdist = nltk.FreqDist(word_to_grams)
    # pair every vocabulary entry with its p-value, then sort by p-value
    pval = []
    c = 0
    while c < len(P_VAL):
        pval.append((word_keys[c][0], P_VAL[c]))
        c += 1
    sorted_pval = sorted(pval, key=lambda x: x[1])
    # attach corpus frequencies, checking the 1000 most common n-grams first
    word_freq_pval = []
    excluded_words = []
    for i in sorted_pval:
        flag = 0
        for j in fdist.most_common(1000):
            if i[0] == j[0]:
                word_freq_pval.append((i[0], i[1], j[1]))
                flag = 1
        if flag == 0:
            excluded_words.append((i[0], i[1]))
    for i in excluded_words:
        word_freq_pval.append((i[0], i[1], fdist[i[0]]))
    word_freq_pval = sorted(word_freq_pval, key=lambda x: x[1])
    # keep only strongly significant or strongly non-significant words
    filtered_freq_pval = []
    for i in word_freq_pval:
        if i[1] <= 0.02 or i[1] >= 0.8:
            filtered_freq_pval.append(i)
    df_filtered_freq_pval = pd.DataFrame(filtered_freq_pval,
                                         columns=['Words', 'P-Value', 'Frequency'])
    return df_filtered_freq_pval
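# Hedged usage sketch (assumptions): the toy corpus, labels, and CountVectorizer
# settings below are illustrative, not from the original source. freq_pval_finder
# reads the globals Y_transpose and corpus, so they are set up here before the call.
# NLTK tokenizer data ('punkt') must be available for nltk.word_tokenize.
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["free prize click now", "meeting at noon", "win a free prize", "lunch tomorrow"]
Y_transpose = pd.DataFrame([1, 0, 1, 0])  # column 0 holds the labels (1 = spam, 0 = ham)
vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)  # binary so entries are 0/1
X = vectorizer.fit_transform(corpus).toarray()
print(freq_pval_finder(X, vectorizer, gram=1, alternative='two-sided'))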