import os

import pandas as pd
from ngram import NGram
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from tqdm import tqdm


def generate_cooccurence_distinct_ngram(path, n=2):
    """
    Generate n-gram features (with and without stopwords removed) for question pairs data. 
    Features will be written in a csv file in path folder.

    Args:
        path: folder containing train.csv and test.csv and to write csv features file.
        n: number of word for the ngram.

    Return:
        
    """

    # Load the training and test sets. header=0 assumes each file ships
    # with a header row, which the explicit column names below replace.
    train = pd.read_csv(
        os.path.join(path, 'train.csv'),
        sep=',',
        header=0,
        names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"])
    test = pd.read_csv(
        os.path.join(path, 'test.csv'),
        sep=',',
        header=0,
        names=["id", "qid1", "qid2", "question1", "question2"])

    # Drop useless columns
    train = train.drop(['id', 'qid1', 'qid2', 'is_duplicate'], axis=1)
    test = test.drop(['id', 'qid1', 'qid2'], axis=1)

    # Set up the tokenizer (\w+ keeps alphanumeric tokens and drops
    # punctuation) and the NLTK English stopword list (a set for fast lookup)
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
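    # e.g. tokenizer.tokenize("What's AI?") -> ['What', 's', 'AI']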

    print('Applying to train...')
    for index, row in tqdm(train.iterrows(), total=len(train)):

        # str() guards against missing (NaN) questions
        question1 = str(row['question1'])
        question2 = str(row['question2'])

        # Tokenize each question
        tokenize1 = tokenizer.tokenize(question1)
        tokenize2 = tokenizer.tokenize(question2)

        # Create n-grams
        ngram1 = list(ngrams(tokenize1, n))
        ngram2 = list(ngrams(tokenize2, n))
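        # e.g. list(ngrams(['how', 'do', 'I'], 2)) == [('how', 'do'), ('do', 'I')]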

        # Remove stopwords
        tokenize_no_stopword1 = [w for w in tokenize1 if w not in stop_words]
        tokenize_no_stopword2 = [w for w in tokenize2 if w not in stop_words]

        # Create n-grams without stopwords
        ngram_no_stopword1 = list(ngrams(tokenize_no_stopword1, n))
        ngram_no_stopword2 = list(ngrams(tokenize_no_stopword2, n))
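        # Worked example of the counting below (n=2, stopwords kept): for
        # "how do I learn" vs. "how do I study", ('how', 'do') and
        # ('do', 'I') occur in both questions and count as co-occurrences,
        # ('I', 'learn') vs. ('I', 'study') share only one word and count as
        # neither, and gram pairs sharing no word count as distinct.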

        # Count co-occurring and distinct n-grams without stopwords. Each
        # gram is wrapped in an NGram set of its words: a pair of grams
        # sharing all n words co-occurs, a pair sharing none is distinct,
        # and partial overlaps count as neither.
        cooccurence_no_stopword = 0
        distinct_no_stopword = 0
        for gram1 in ngram_no_stopword1:
            n1 = NGram(gram1)
            for gram2 in ngram_no_stopword2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct_no_stopword += 1
                elif len(inter) == n:
                    cooccurence_no_stopword += 1

        train.loc[index, str(n) +
                  'gram_nostpwrd_cooccurence'] = cooccurence_no_stopword
        train.loc[index,
                  str(n) + 'gram_nostpwrd_distinct'] = distinct_no_stopword

        # Count co-occurring and distinct n-grams with stopwords kept
        cooccurence = 0
        distinct = 0
        for gram1 in ngram1:
            n1 = NGram(gram1)
            for gram2 in ngram2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct += 1
                elif len(inter) == n:
                    cooccurence += 1

        train.loc[index, str(n) + 'gram_cooccurence'] = cooccurence
        train.loc[index, str(n) + 'gram_distinct'] = distinct

    # Drop the question columns before writing the features to CSV
    train = train.drop(['question1', 'question2'], axis=1)
    print('Writing train features...')
    train.to_csv(os.path.join(path, 'train_' + str(n) + 'gram_feat.csv'))

    print('Applying to test...')
    for index, row in tqdm(test.iterrows(), total=len(test)):

        # str() guards against missing (NaN) questions
        question1 = str(row['question1'])
        question2 = str(row['question2'])

        # Tokenize each question
        tokenize1 = tokenizer.tokenize(question1)
        tokenize2 = tokenizer.tokenize(question2)

        # Create n-grams
        ngram1 = list(ngrams(tokenize1, n))
        ngram2 = list(ngrams(tokenize2, n))

        # Remove stopwords
        tokenize_no_stopword1 = [w for w in tokenize1 if w not in stop_words]
        tokenize_no_stopword2 = [w for w in tokenize2 if w not in stop_words]

        # Create n-grams without stopwords
        ngram_no_stopword1 = list(ngrams(tokenize_no_stopword1, n))
        ngram_no_stopword2 = list(ngrams(tokenize_no_stopword2, n))

        # Count co-occurring and distinct n-grams without stopwords
        cooccurence_no_stopword = 0
        distinct_no_stopword = 0
        for gram1 in ngram_no_stopword1:
            n1 = NGram(gram1)
            for gram2 in ngram_no_stopword2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct_no_stopword += 1
                elif len(inter) == n:
                    cooccurence_no_stopword += 1

        test.loc[index, str(n) +
                 'gram_nostpwrd_cooccurence'] = cooccurence_no_stopword
        test.loc[index,
                 str(n) + 'gram_nostpwrd_distinct'] = distinct_no_stopword

        # Count co-occurring and distinct n-grams with stopwords kept
        cooccurence = 0
        distinct = 0
        for gram1 in ngram1:
            n1 = NGram(gram1)
            for gram2 in ngram2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct += 1
                elif len(inter) == n:
                    cooccurence += 1

        test.loc[index, str(n) + 'gram_cooccurence'] = cooccurence
        test.loc[index, str(n) + 'gram_distinct'] = distinct

    # Drop the question columns before writing the features to CSV
    test = test.drop(['question1', 'question2'], axis=1)
    print('Writing test features...')
    test.to_csv(os.path.join(path, 'test_' + str(n) + 'gram_feat.csv'))

    print('CSV files written to', path,
          "| suffix: _" + str(n) + "gram_feat.csv")