Example #1
import re
import string

from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize

def tiny_tokenize_xml(text, stem=False, stop_words=()):
    # Drop non-ASCII chars, replace punctuation with spaces, tokenize, optionally stem.
    return [
        EnglishStemmer().stem(token) if stem else token
        for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ',
                   text.encode('ascii', errors='ignore').decode('ascii')))
        if not token.isdigit() and token not in stop_words
    ]
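A minimal usage sketch (the sample sentence and stop-word tuple are illustrative, not from the original source):

tokens = tiny_tokenize_xml(
    "the 3 quick brown foxes are jumping over 2 lazy dogs!",
    stem=True,
    stop_words=('the', 'are', 'over'),
)
print(tokens)  # roughly ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']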
Example #2
def _bow(train, test, max_features=1000):
    """
    bag-of-words encoding helper function
    """
    import re
    import string

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.stem import WordNetLemmatizer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer
    from nltk.tokenize import wordpunct_tokenize

    x_train, y_train = train
    x_test, y_test = test

    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    for i in range(len(x_train)):
        x_train[i] = " ".join([
            lemmatizer.lemmatize(stemmer.stem(token.lower()))
            for token in wordpunct_tokenize(
                re.sub("[%s]" % re.escape(string.punctuation), "", x_train[i]))
        ])

    vectorizer_train = CountVectorizer(
        strip_accents="ascii",
        stop_words="english",
        token_pattern=r"(?u)\b\w[a-z]\w+[a-z]\b",
        max_features=max_features,
        vocabulary=None,
        dtype="float32",
    )
    x_train = vectorizer_train.fit_transform(x_train).toarray()

    # get_feature_names() was removed in newer scikit-learn; use get_feature_names_out()
    vocab_train = list(vectorizer_train.get_feature_names_out())
    vectorizer_test = CountVectorizer(
        strip_accents="ascii",
        stop_words="english",
        token_pattern=r"(?u)\b\w[a-z]\w+[a-z]\b",
        max_features=max_features,
        vocabulary=vocab_train,
        dtype="float32",
    )
    x_test = vectorizer_test.fit_transform(x_test).toarray()

    # remove documents with no words
    r = np.where(x_train.sum(axis=1) > 0.0)[0]
    x_train = x_train[r, :]
    y_train = y_train[r]

    r = np.where(x_test.sum(axis=1) > 0.0)[0]
    x_test = x_test[r, :]
    y_test = y_test[r]

    return (x_train, y_train), (x_test, y_test), vocab_train
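A hypothetical call with toy documents and labels (illustrative only; the lemmatizer needs the NLTK wordnet data to be downloaded):

import numpy as np

train = (["the cats are sleeping quietly", "dogs were barking loudly outside"],
         np.array([0, 1]))
test = (["a sleeping cat on the mat"], np.array([0]))
(bx_train, by_train), (bx_test, by_test), vocab = _bow(train, test, max_features=100)
print(bx_train.shape, bx_test.shape, len(vocab))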
Example #3
def tiny_tokenize(text, stem=False, stop_words=()):
    words = []
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)

    return words
Example #4
def tiny_tokenize(text, stem=False, stop_words=()):
    # `text` is expected to be bytes; invalid UTF-8 sequences are dropped.
    words = []
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ',
                   text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)

    return words
def __stem_doc(doc_details):
    # Import nltk tools
    from nltk.tokenize import wordpunct_tokenize
    # from nltk.stem.snowball import EnglishStemmer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer
    idx, doc = doc_details
    if idx % 100 == 0:
        print("Processed doc " + str(idx))
    if doc.endswith('.txt'):
        d = open(doc, 'rb').read()  # read bytes so the UTF-8 decode below works
        stemmer = EnglishStemmer()  # This method only works for English documents.
        # Stem, lowercase, strip all punctuation, remove stopwords.
        attribute_names = [stemmer.stem(token.lower()) for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), '',
                   d.decode(encoding='UTF-8', errors='ignore')))
                           if token.lower() not in stopwords.get_stopwords()]
        # `s` (the serializer) and `stopwords` come from the surrounding module.
        s.dump(attribute_names, open(doc.replace(".txt", ".p"), "wb"))
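A hedged sketch of how this helper might be driven: the (idx, doc) tuple argument suggests mapping it over an enumerated file list, e.g. with a multiprocessing pool (paths and pool size are illustrative; the `s` serializer and `stopwords` module must be provided by the surrounding code):

import glob
from multiprocessing import Pool

doc_paths = glob.glob('corpus/*.txt')
with Pool(processes=4) as pool:
    pool.map(__stem_doc, list(enumerate(doc_paths)))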
Example #6
def clean_and_tokenize(stem):
    # `readingfiles`, `is_number`, and `stop_words` are defined in the
    # surrounding module of the original code.
    para = readingfiles()
    index = np.arange(len(para))
    np.random.shuffle(index)
    sents = [para[i] for i in index.tolist()]

    word_freqs = collections.Counter()
    sent_lens = []
    parsed_sentences = []
    for sent in sents:
        # Keep only letters and collapse repeated spaces.
        sent = re.sub('[^a-zA-Z]', ' ', str(sent))
        sent = re.sub(' +', ' ', sent)
        parsed_words = []
        for word in nltk.word_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), '', sent)):
            if not is_number(word) and word.strip().lower() not in stop_words \
                    and word.isalpha() and len(word) > 2:
                if stem:
                    try:
                        w = EnglishStemmer().stem(word)
                        if w in stop_words:
                            w = ''
                    except Exception:
                        w = word.strip().lower()
                else:
                    w = word.strip().lower()
                word_freqs[w] += 1
                parsed_words.append(w)
        # Keep only sentences with more than three surviving tokens.
        if len(parsed_words) > 3:
            sent_lens.append(len(parsed_words))
            parsed_sentences.append(" ".join(parsed_words))
    return sent_lens, parsed_sentences, word_freqs
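A short follow-up sketch, assuming the module-level helpers (readingfiles, is_number, stop_words) exist, showing how the returned Counter might be turned into a fixed-size vocabulary (the size 5000 is illustrative):

sent_lens, parsed_sentences, word_freqs = clean_and_tokenize(stem=True)
# Map the most frequent words to integer ids; reserve 0 for padding and 1 for OOV.
vocab_size = 5000
word2id = {w: i + 2 for i, (w, _) in enumerate(word_freqs.most_common(vocab_size))}
encoded = [[word2id.get(w, 1) for w in s.split()] for s in parsed_sentences]
print(len(encoded), max(sent_lens))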
Example #7
import os
import re
import string
from collections import defaultdict

from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize


def load_data(corpus_path, recursive=False):
    # `get_all_files` and `load_stopwords` are helpers from the surrounding project.
    word_freq = defaultdict(int)       # number of times a word appears in the corpus
    doc_word_freq = defaultdict(dict)  # number of times a word appears in each doc
    files = get_all_files(corpus_path, recursive)

    # word_tokenizer = RegexpTokenizer(r'[a-zA-Z]+') # match only alphabet characters
    stemmer = EnglishStemmer()  # This method only works for English text.
    try:
        stopword_path = 'patterns/english_stopwords.txt'
        cached_stop_words = load_stopwords(os.path.join(os.path.split(__file__)[0], stopword_path))
        print('Loaded %s' % stopword_path)
    except Exception:
        from nltk.corpus import stopwords
        cached_stop_words = stopwords.words("english")
        print('Loaded nltk.corpus.stopwords')

    for filename in files:
        with open(filename, 'r', encoding='utf-8', errors='ignore') as fp:
            text = fp.read().lower()
            # remove punctuation, stopwords and digits, then stem
            words = [stemmer.stem(token) for token in wordpunct_tokenize(
                         re.sub('[%s]' % re.escape(string.punctuation), ' ', text))
                     if not token.isdigit() and token not in cached_stop_words]

            # doc name: parent directory + file name
            parent_name, child_name = os.path.split(filename)
            doc_name = os.path.split(parent_name)[-1] + '_' + child_name
            for word in words:
                # doc-word frequency
                doc_word_freq[doc_name][word] = doc_word_freq[doc_name].get(word, 0) + 1
                # corpus-wide word frequency
                word_freq[word] += 1

    return word_freq, doc_word_freq
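A hypothetical usage sketch (the corpus path is illustrative), deriving document frequencies from the returned mappings:

word_freq, doc_word_freq = load_data('data/corpus', recursive=True)
# Document frequency: in how many documents does each word occur?
doc_freq = defaultdict(int)
for doc_name, counts in doc_word_freq.items():
    for word in counts:
        doc_freq[word] += 1
print('vocabulary size:', len(word_freq))
print('10 most frequent words:', sorted(word_freq, key=word_freq.get, reverse=True)[:10])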
def stem_acceptance_list(path):
    """
    Stem the acceptance list given by the path. This should be done before data preparation for that specific list.

    @param path: The path to the acceptance list.
    """
    global EnglishStemmer
    from nltk.stem.porter import PorterStemmer as ES

    EnglishStemmer = ES

    with open(path) as fp:
        acceptance_lst = fp.read().replace(" ", "").split("\n")
    stemmer = EnglishStemmer()
    acceptance_lst_stemmed = [stemmer.stem(word.lower()) for word in acceptance_lst]

    # Write one stemmed word per line, without a trailing newline.
    # `env_paths` is a module from the surrounding project.
    with open(env_paths.get_acceptance_lst_path(), 'w') as f:
        f.write("\n".join(acceptance_lst_stemmed))