import os

import pandas as pd
from ngram import NGram
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from tqdm import tqdm


def generate_cooccurence_distinct_ngram(path, n=2):
    """
    Generate n-gram features (with and without stopwords removed) for question pairs data. 
    Features will be written in a csv file in path folder.

    Args:
        path: folder containing train.csv and test.csv and to write csv features file.
        n: number of word for the ngram.

    Return:
        
    """

    # Load the training and test sets. header=0 assumes each file ships
    # with a header row, which the explicit column names below replace.
    train = pd.read_csv(
        os.path.join(path, 'train.csv'),
        sep=',',
        header=0,
        names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"])
    test = pd.read_csv(
        os.path.join(path, 'test.csv'),
        sep=',',
        header=0,
        names=["id", "qid1", "qid2", "question1", "question2"])

    # Drop useless columns
    train = train.drop(['id', 'qid1', 'qid2', 'is_duplicate'], axis=1)
    test = test.drop(['id', 'qid1', 'qid2'], axis=1)

    # Set up the tokenizer (\w+ keeps alphanumeric tokens and drops
    # punctuation) and the NLTK English stopword list (a set for fast lookup)
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
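    # e.g. tokenizer.tokenize("What's AI?") -> ['What', 's', 'AI']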

    print('Applying to train...')
    for index, row in tqdm(train.iterrows(), total=len(train)):

        # str() guards against missing (NaN) questions
        question1 = str(row['question1'])
        question2 = str(row['question2'])

        # Tokenize each question
        tokenize1 = tokenizer.tokenize(question1)
        tokenize2 = tokenizer.tokenize(question2)

        # Create n-grams
        ngram1 = list(ngrams(tokenize1, n))
        ngram2 = list(ngrams(tokenize2, n))
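        # e.g. list(ngrams(['how', 'do', 'I'], 2)) == [('how', 'do'), ('do', 'I')]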

        # Remove stopwords
        tokenize_no_stopword1 = [w for w in tokenize1 if w not in stop_words]
        tokenize_no_stopword2 = [w for w in tokenize2 if w not in stop_words]

        # Create n-grams without stopwords
        ngram_no_stopword1 = list(ngrams(tokenize_no_stopword1, n))
        ngram_no_stopword2 = list(ngrams(tokenize_no_stopword2, n))
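        # Worked example of the counting below (n=2, stopwords kept): for
        # "how do I learn" vs. "how do I study", ('how', 'do') and
        # ('do', 'I') occur in both questions and count as co-occurrences,
        # ('I', 'learn') vs. ('I', 'study') share only one word and count as
        # neither, and gram pairs sharing no word count as distinct.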

        # Count co-occurring and distinct n-grams without stopwords. Each
        # gram is wrapped in an NGram set of its words: a pair of grams
        # sharing all n words co-occurs, a pair sharing none is distinct,
        # and partial overlaps count as neither.
        cooccurence_no_stopword = 0
        distinct_no_stopword = 0
        for gram1 in ngram_no_stopword1:
            n1 = NGram(gram1)
            for gram2 in ngram_no_stopword2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct_no_stopword += 1
                elif len(inter) == n:
                    cooccurence_no_stopword += 1

        train.loc[index, str(n) +
                  'gram_nostpwrd_cooccurence'] = cooccurence_no_stopword
        train.loc[index,
                  str(n) + 'gram_nostpwrd_distinct'] = distinct_no_stopword

        # Count co-occurring and distinct n-grams with stopwords kept
        cooccurence = 0
        distinct = 0
        for gram1 in ngram1:
            n1 = NGram(gram1)
            for gram2 in ngram2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct += 1
                elif len(inter) == n:
                    cooccurence += 1

        train.loc[index, str(n) + 'gram_cooccurence'] = cooccurence
        train.loc[index, str(n) + 'gram_distinct'] = distinct

    # Drop the question columns before writing the features to CSV
    train = train.drop(['question1', 'question2'], axis=1)
    print('Writing train features...')
    train.to_csv(os.path.join(path, 'train_' + str(n) + 'gram_feat.csv'))

    print('Applying to test...')
    for index, row in tqdm(test.iterrows(), total=len(test)):

        # str() guards against missing (NaN) questions
        question1 = str(row['question1'])
        question2 = str(row['question2'])

        # Tokenize each question
        tokenize1 = tokenizer.tokenize(question1)
        tokenize2 = tokenizer.tokenize(question2)

        # Create n-grams
        ngram1 = list(ngrams(tokenize1, n))
        ngram2 = list(ngrams(tokenize2, n))

        # Remove stopwords
        tokenize_no_stopword1 = [w for w in tokenize1 if w not in stop_words]
        tokenize_no_stopword2 = [w for w in tokenize2 if w not in stop_words]

        # Create n-grams without stopwords
        ngram_no_stopword1 = list(ngrams(tokenize_no_stopword1, n))
        ngram_no_stopword2 = list(ngrams(tokenize_no_stopword2, n))

        # Count co-occurring and distinct n-grams without stopwords
        cooccurence_no_stopword = 0
        distinct_no_stopword = 0
        for gram1 in ngram_no_stopword1:
            n1 = NGram(gram1)
            for gram2 in ngram_no_stopword2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct_no_stopword += 1
                elif len(inter) == n:
                    cooccurence_no_stopword += 1

        test.loc[index, str(n) +
                 'gram_nostpwrd_cooccurence'] = cooccurence_no_stopword
        test.loc[index,
                 str(n) + 'gram_nostpwrd_distinct'] = distinct_no_stopword

        # Count co-occurring and distinct n-grams with stopwords kept
        cooccurence = 0
        distinct = 0
        for gram1 in ngram1:
            n1 = NGram(gram1)
            for gram2 in ngram2:
                n2 = NGram(gram2)
                inter = n1.intersection(n2)
                if len(inter) == 0:
                    distinct += 1
                elif len(inter) == n:
                    cooccurence += 1

        test.loc[index, str(n) + 'gram_cooccurence'] = cooccurence
        test.loc[index, str(n) + 'gram_distinct'] = distinct

    # Drop the question columns before writing the features to CSV
    test = test.drop(['question1', 'question2'], axis=1)
    print('Writing test features...')
    test.to_csv(os.path.join(path, 'test_' + str(n) + 'gram_feat.csv'))

    print('CSV files written to', path,
          "| suffix: _" + str(n) + "gram_feat.csv")