Example #1
def generateNGrams(sequence, n):
    # Slide a window of n tokens over the sequence and join each window with spaces
    ngrams = []
    for i in range(len(sequence) - n + 1):
        ngrams.append(' '.join(sequence[i:i + n]))
    return ngrams
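As a quick sanity check (the token list below is just sample data), generateNGrams returns each window of n tokens joined with spaces:

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(generateNGrams(tokens, 2))
# ['the cat', 'cat sat', 'sat on', 'on the', 'the mat']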
Example #2
import string
from collections import Counter

def create_ngram(n, texts, test=False):
    '''
    input: list of word-tokenized sentences
    '''
    # Clean the text and add pseudo start/end tokens
    lst_text = []
    for s in texts:
        s = [''.join(c for c in w if c not in string.punctuation) for w in s] # Remove punctuation
        s = [w.lower() for w in s if w] # Remove the empty strings
        s = ['<s>'] * (n - 1) + s + ['</s>'] # n-1 start tokens so the first word has full left context
        lst_text.extend(s)

    # For training text, replace rare words with the unknown token
    if not test:
        counts = Counter(lst_text) # count word frequencies once, not once per word
        low_counter = {w for w in counts if counts[w] < 2}
        for i, w in enumerate(lst_text):
            if w in low_counter:
                lst_text[i] = '<UNK>'

    # Build the ngrams
    ngrams = []
    for i in range(len(lst_text) - n + 1):
        ngram = tuple(lst_text[i:i + n])
        ngrams.append(ngram)

    return ngrams
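For example, on a tiny tokenized corpus (the sentences below are illustrative only), create_ngram pads each sentence with n-1 start tokens and an end token, and replaces words seen fewer than twice with <UNK>:

sentences = [['I', 'like', 'cats'], ['I', 'like', 'dogs']]
print(create_ngram(2, sentences))
# [('<s>', 'i'), ('i', 'like'), ('like', '<UNK>'), ('<UNK>', '</s>'),
#  ('</s>', '<s>'), ('<s>', 'i'), ('i', 'like'), ('like', '<UNK>'), ('<UNK>', '</s>')]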
Example #3
def ngramify(clean, n):
    # Collect every window of n consecutive items from the cleaned sequence as a tuple
    ngrams = []
    for i in range(len(clean) - n + 1):
        grams = tuple(clean[i:i + n])
        ngrams.append(grams)
    return ngrams
Example #4
def get3Grams(payload_obj):
    '''Divides a string into 3-grams
    
    Example: input - payload: "<script>"
             output- ["<sc","scr","cri","rip","ipt","pt>"]
    '''
    payload = str(payload_obj)
    ngrams = []
    # len(payload) - 3 + 1 windows, so the final 3-gram ("pt>") is included
    for i in range(len(payload) - 3 + 1):
        ngrams.append(payload[i:i + 3])
    return ngrams
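With the loop bound corrected, the function reproduces its own docstring example:

print(get3Grams("<script>"))
# ['<sc', 'scr', 'cri', 'rip', 'ipt', 'pt>']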
Example #5
def get1Grams(payload_obj):
    '''Divides a string into 1-grams
    
    Example: input - payload: "<script>"
             output- ["<","s","c","r","i","p","t",">"]
    '''
    payload = str(payload_obj)
    ngrams = []
    # iterate over every character so the final ">" is included
    for i in range(len(payload)):
        ngrams.append(payload[i:i + 1])
    return ngrams
Example #6
import pandas as pd

def create_tfidf_vectors(dataframe):
    # compute_ngrams, compute_tfidf, num_speeches and doc_freq are defined elsewhere in the module
    speeches = dataframe['concat_speeches'].tolist()
    ngrams = []
    for unit in speeches:
        ngrams.append(compute_ngrams(unit, 2)) # bigrams for each concatenated speech
    ngrams_to_add = pd.Series(ngrams)
    dataframe['ngrams'] = ngrams_to_add.values
    tfidf = []
    for element in ngrams:
        tfidf.append(compute_tfidf(element, num_speeches, doc_freq))
    tfidf_to_add = pd.Series(tfidf)
    dataframe['tfidf'] = tfidf_to_add.values
    return dataframe
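create_tfidf_vectors depends on compute_ngrams and compute_tfidf, which are not shown here; based on how compute_ngrams is called above (a speech string and an n), a minimal sketch might look like this, with the whitespace tokenization being an assumption:

def compute_ngrams(unit, n):
    # hypothetical sketch: whitespace-tokenize the speech and return space-joined n-grams
    tokens = unit.split()
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]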
Example #7
import re

def generate_ngrams(s, n):
    # convert to lower case
    s = s.lower()

    # replace all non-alphanumeric characters with spaces;
    # it is important to include the specific letters of the Serbian alphabet such as ž, š and similar
    s = re.sub(r'[^a-zA-Z0-9ćčžšđ\s]', ' ', s)

    # split the sentence into tokens, dropping empty tokens
    tokens = [token for token in s.split(" ") if token != ""]

    # walk over the token list to generate the n-grams,
    # joining the tokens of each window back together with spaces
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(' '.join(tokens[i:i + n]))

    return ngrams
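For instance, on a short Serbian phrase (sample input only), the punctuation is stripped, the text is lower-cased, and the bigrams come back as space-joined strings:

print(generate_ngrams("Dobar dan, kako ste?", 2))
# ['dobar dan', 'dan kako', 'kako ste']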
Example #8
import nltk

def calc_n_grams(data_line, n):

    ngrams = []

    # create pos-tagged tokens
    wrds = nltk.pos_tag(nltk.word_tokenize(data_line))

    # punctuation tags we want to delete
    del_tk = {'LS', 'SYM', 'SENT', '#', '$', '"', '``', '{', '}', '(', ')', ',', ';', '.', ':', '\'\''}

    # drop every token whose POS tag marks punctuation (including the last token)
    wrds = [w for w in wrds if w[1] not in del_tk]

    if n <= len(wrds):
        # add start and end tokens (the same sentinel is used at both ends)
        wrds.insert(0, ("START", "SENT_TOKEN"))
        wrds.append(("START", "SENT_TOKEN"))

        # each n-gram is a flat tuple of n (word, tag) pairs
        for i in range(len(wrds) - n + 1):
            ng = ()
            for j in range(n):
                ng = ng + tuple(wrds[i + j])
            ngrams.append(ng)

    return ngrams
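Assuming NLTK's 'punkt' and 'averaged_perceptron_tagger' resources are installed, a trigram call looks roughly like this (the exact POS tags depend on the tagger model):

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
print(calc_n_grams("The cat sat on the mat.", 3)[:2])
# e.g. [('START', 'SENT_TOKEN', 'The', 'DT', 'cat', 'NN'),
#       ('The', 'DT', 'cat', 'NN', 'sat', 'VBD')]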
Example #9
def generateNGrams(sequence, n):
    ngrams = []
    for i in range(len(sequence) - n + 1):
        ngrams.append(' '.join(sequence[i:i + n]))
    return ngrams
Example #10
from itertools import chain
import nltk
import pandas as pd
from scipy.stats import binom_test  # renamed to binomtest in newer SciPy versions

def freq_pval_finder(X, vectorizer, gram, alternative):
    # X is a binary document-term matrix; Y_transpose (labels) and corpus (raw texts)
    # are expected to be defined at module level
    X_df = pd.DataFrame(X)
    zeros = []
    ones = []

    # For each word (column), record the row indices where it is present (1) or absent (0)
    for i in X_df:
        zero_filter = []
        ones_filter = []
        for counter in range(X_df[i].shape[0]):
            if X_df[i][counter] == 1:
                ones_filter.append(counter)
            if X_df[i][counter] == 0:
                zero_filter.append(counter)
        zeros.append(zero_filter)
        ones.append(ones_filter)
    success = [] # word is present
    failure = [] # word is absent

    for i, j in zip(ones, zeros):
        each_success = []
        each_failure = []
        for k in i:
            each_success.append(Y_transpose[0][k])
        for l in j:
            each_failure.append(Y_transpose[0][l])

        success.append(each_success)
        failure.append(each_failure)


    success_prop = []
    failure_prop = []

    for i,j in zip(success, failure):
        success_prop.append(sum(i)/ len(i)) #gives the proportion of spams when the word is present
        failure_prop.append(sum(j)/len(j)) #gives the proportion of spams when the word is not present

    # Binomial test per word: compare the spam rate when the word is present against
    # the spam rate when it is absent (used as the null proportion)
    P_VAL = []
    for i, j in zip(success, failure_prop):
        p_val = binom_test(sum(i), len(i), j, alternative=alternative)
        P_VAL.append(p_val)


    # vocabulary terms ordered by their column index, so they line up with P_VAL
    word_keys = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])


    # Build n-grams over the raw corpus to get a frequency count for each term
    def word_to_ngrams(words, n):
        return [' '.join(words[i: i + n]) for i in range(len(words) - n + 1)]

    tokens = [nltk.word_tokenize(i) for i in corpus]

    ngrams = []
    for i in tokens:
        ngrams.append(word_to_ngrams(i, gram))

    word_to_grams = list(chain(*ngrams))

    fdist = nltk.FreqDist(word_to_grams)

    # Pair each vocabulary term with its p-value, then sort by p-value
    pval = []
    for c in range(len(P_VAL)):
        pval.append((word_keys[c][0], P_VAL[c]))

    sorted_pval = sorted(pval, key=lambda x: x[1])

    # Attach the corpus frequency to each (word, p-value) pair; terms outside the
    # 1000 most common n-grams are collected separately and looked up directly in fdist
    word_freq_pval = []
    excluded_words = []
    for i in sorted_pval:
        flag = 0
        for j in fdist.most_common(1000):
            if i[0] == j[0]:
                word_freq_pval.append((i[0], i[1], j[1]))
                flag = 1
        if flag == 0:
            excluded_words.append((i[0], i[1]))

    for i in excluded_words:
        word_freq_pval.append((i[0], i[1],fdist[i[0]]))

    word_freq_pval = sorted(word_freq_pval, key = lambda x: x[1])

    # Keep only the terms whose p-value is very small or very large
    filtered_freq_pval = []
    for i in word_freq_pval:
        if i[1] <= 0.02 or i[1] >= 0.8:
            filtered_freq_pval.append(i)

    df_filtered_freq_pval = pd.DataFrame(filtered_freq_pval, columns= ['Words', 'P-Value', 'Frequency'])
    return df_filtered_freq_pval
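freq_pval_finder expects a binary document-term matrix X plus module-level corpus and Y_transpose variables; one way to prepare such inputs (the sample texts, labels, and variable names below are assumptions) is with scikit-learn's CountVectorizer:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["win a free prize now", "see you at lunch", "free prize waiting"]  # sample texts
labels = [1, 0, 1]                                                           # 1 = spam, 0 = ham

vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
X = vectorizer.fit_transform(corpus).toarray()
Y_transpose = np.array([labels])  # shape (1, n_docs), so Y_transpose[0][k] is the label of document k

result = freq_pval_finder(X, vectorizer, gram=1, alternative='greater')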