import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves', 'playing', 'is',
         'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']

pt_stemmer = pt.PorterStemmer()             # Porter stemmer: relatively lenient
lc_stemmer = lc.LancasterStemmer()          # Lancaster stemmer: relatively strict
sb_stemmer = sb.SnowballStemmer('english')  # Snowball stemmer: middle ground

for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    print('%8s %8s %8s %8s' % (word, pt_stem, lc_stem, sb_stem))
import re

from nltk.corpus import words
from nltk.stem import snowball


class Word:
    nonalpha_regex = r'[^a-zA-Z]'
    numeric_regex = r'[0-9]'
    html_regex = r'<.*?>'
    stemmer = snowball.SnowballStemmer("english")
    word_corpus = set(words.words())
    stop_words = {
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
        'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been',
        'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't",
        'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does',
        "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',
        'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have',
        "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here',
        "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
        'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't",
        'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most',
        "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on',
        'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
        'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll",
        "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that',
        "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
        "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until',
        'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're",
        "we've", 'were', "weren't", 'what', "what's", 'when', "when's",
        'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why',
        "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd",
        "you'll", "you're", "you've", 'your', 'yours', 'yourself',
        'yourselves'
    }

    @staticmethod
    def strip(word):
        return re.sub(Word.html_regex, '', word)

    @staticmethod
    def sanitize(word):
        return re.sub(Word.nonalpha_regex, '', word)

    @staticmethod
    def normalize(word):
        return word.lower()

    @staticmethod
    def stem(word):
        return Word.stemmer.stem(word)

    @staticmethod
    def process(word):
        word = Word.normalize(word)
        word = Word.strip(word)
        word = Word.sanitize(word)
        word = Word.stem(word)
        return word

    @staticmethod
    def is_stop(word):
        return word in Word.stop_words

    @staticmethod
    def is_numeric(word):
        return re.match(Word.numeric_regex, word) is not None

    @staticmethod
    def is_html(word):
        return re.match(Word.html_regex, word) is not None

    @staticmethod
    def is_word(word):
        return word in Word.word_corpus
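# Usage sketch for the Word helpers above (not part of the original module).
# Assumes the NLTK "words" corpus has been downloaded, e.g. via
# nltk.download('words'); the sample tokens below are illustrative only.
raw_tokens = ['<b>Wolves</b>', 'playing', 'the', 'beaches']
cleaned = []
for token in raw_tokens:
    processed = Word.process(token)  # lowercase, strip HTML, drop non-letters, stem
    if not processed or Word.is_stop(processed):
        continue  # skip empty results and stop words
    cleaned.append(processed)
print(cleaned)  # expected roughly ['wolv', 'play', 'beach'] with the Snowball stemmer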
from urllib import request

import nltk.stem.snowball as sb
from bs4 import BeautifulSoup as bs


def main(word, keepNone=False):
    '''
    input:
        word (str): input word
    ---------------------
    return:
        meanExamList (list): [(mean1, [mean1exm1, mean1exm2, ...]),
                              (mean2, [mean2exam1, ...]), ...]
    '''
    print("connecting to the website ...")
    myUrl = "https://www.macmillandictionary.com/dictionary/british/" + word
    res = request.urlopen(myUrl)

    print("finding needed information ...")
    soupStr = bs(res.read(), features="lxml")
    olBsL = soupStr.find_all("ol", class_="senses")
    if len(olBsL) > 0:
        olBs = olBsL[0]
    else:
        print("problems in '", word, "': not enough meanings")
        # Retry once with the stemmed form of the word before giving up.
        tempWord = word
        sb_stemmer = sb.SnowballStemmer("english")
        word = sb_stemmer.stem(word)
        if tempWord != word:
            return main(word)
        else:
            return []

    liBsList = olBs.find_all("li")
    print("finding meanings and examples ...")
    meaningList = []
    exampleList = []
    for liItem in liBsList:
        if len(liItem.find_all("div", class_="SENSE")) > 0:
            sense = liItem.find_all("div", class_="SENSE")[0]
            if len(sense.find_all("span", class_="DEFINITION")) > 0:
                liDivItem = sense.find_all("span", class_="DEFINITION")[0].text
            elif len(sense.find_all("span", class_="GREF-ENTRY")) > 0:
                liDivItem = sense.find_all("span", class_="GREF-ENTRY")[0].find_all("a")[0].text
                if liDivItem is None:
                    print("problems in", word, ": None")
                    return [("no means", "no examples")]
                else:
                    return main(liDivItem, keepNone=False)
            elif len(sense.find_all("div", class_="sideboxbody")) > 0:
                liDivItem = sense.find_all("div", class_="sideboxbody")[0].find_all("a")[0].text
                if liDivItem is None:
                    print("problems in", word, ": None")
                    return [("no means", "no examples")]
                else:
                    return main(liDivItem, keepNone=False)
            else:
                return [("no means", "no examples")]
            meaningList.append(liDivItem)

        if len(liItem.find_all("div", class_="SENSE")) > 0:
            try:
                examples = liItem.find_all("div", class_="SENSE")[0].find_all("p", class_="EXAMPLE")
                exampleList.append([example.text for example in examples])
            except IndexError:
                pass

    meanExamList = list(zip(meaningList, exampleList))
    if len(meanExamList[0][1]) == 0:
        print("problems in '", word,
              "': not enough examples, replacing examples with meanings")
        meanExamList = [(meanExamItem[0], [meanExamItem[0]])
                        for meanExamItem in meanExamList]

    if not keepNone:
        meanExamList = [row for row in meanExamList if len(row[1]) != 0]
    return meanExamList
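# Illustrative call to main() above; the word "dog" is only an example, and
# the actual output depends on the live Macmillan page and its current markup.
if __name__ == "__main__":
    for meaning, examples in main("dog"):
        print("meaning:", meaning)
        for example in examples:
            print("  example:", example)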
def __init__(self):
    self.stemmer = snowball.SnowballStemmer("english")
    self.Porter_stemmer = PorterStemmer()
def run(self):
    global _independent_transformers
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    train_data = rf_dataset.Dataset().load_all(
        'train', as_df=True)[['question1_clean', 'question2_clean']]
    test_data = rf_dataset.Dataset().load_all(
        'test', as_df=True)[['question1_clean', 'question2_clean']]

    all_data = pandas.concat([train_data, test_data], 0)
    all_q1 = list(all_data['question1_clean'])
    all_t1 = list(
        tqdm(multiprocessing.Pool().imap(self.tokenize, all_q1, chunksize=5000),
             total=len(all_q1),
             desc='Tokenizing: 1'))
    all_q2 = list(all_data['question2_clean'])
    all_t2 = list(
        tqdm(multiprocessing.Pool().imap(self.tokenize, all_q2, chunksize=5000),
             total=len(all_q2),
             desc='Tokenizing: 2'))

    all_indep_dists = list(
        tqdm(multiprocessing.Pool().imap(transform,
                                         zip(all_q1, all_q2, all_t1, all_t2),
                                         chunksize=5000),
             total=len(all_q1),
             desc='Computing distances'))
    all_df = pandas.DataFrame(all_indep_dists)

    print('Loading dependent transforms')
    dependent_transformers = {
        'word_mover': WordMoverDistance(),
        'sentiment': SentimentDifference()
    }
    print('Finished loading!')

    for name, fn in dependent_transformers.items():
        dist = [
            fn(q1, q2, t1, t2)
            for q1, q2, t1, t2 in tqdm(zip(all_q1, all_q2, all_t1, all_t2),
                                       total=len(all_q1),
                                       desc=name)
        ]
        if isinstance(dist[0], dict):
            frame = pandas.DataFrame.from_dict(dist, orient='columns')
            for col in frame:
                all_df[name + '_' + col] = frame[col]
        else:
            all_df[name] = dist

    self.output().makedirs()
    train_dists = all_df.iloc[:train_data.shape[0]]
    test_dists = all_df.iloc[train_data.shape[0]:]
    train_dists.to_msgpack(_train_loc)
    test_dists.to_msgpack(_test_loc)

    little_cls = ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
    little_cls.fit(
        train_dists.clip(-10000, 10000).values,
        rf_dataset.Dataset().load_all('train', as_df=True).is_duplicate.values)
    print(
        pandas.Series(little_cls.feature_importances_,
                      train_dists.columns).sort_values())
    with self.output().open('w') as f:
        f.write(
            str(
                pandas.Series(little_cls.feature_importances_,
                              train_dists.columns).sort_values()))
        f.write("\n")
def BuildInvertedIndex(document_paths):
    # In-memory indexer for creating an inverted index
    stemmer = snowball.SnowballStemmer('english')
    DocumentIndex = {}  # {key = doc_id: value = (url, doc_path)}
    InvertedIndex = {}  # inverted list storage (dictionary of tokens/words/n-grams -> posting lists)
    n = 0  # document numbering

    for document_path in document_paths:
        # Read the JSON file, which contains ['url', 'content', 'encoding'] for a document
        with open(document_path, 'r') as fh:
            json_object = json.load(fh)
        url, pageContent, encoding = (json_object["url"],
                                      json_object["content"],
                                      json_object["encoding"])

        # Ignore urls with fragments
        if urlparse(url).fragment != "":
            continue

        soup = BeautifulSoup(pageContent, 'lxml')
        text = soup.get_text()
        # Check whether the page contains any text
        if text == '':
            continue

        n += 1
        print(f"Indexing document #{n}")
        DocumentIndex[n] = (url, document_path)

        tokens = tokenize(text)  # tokenize text in the HTML document
        tokenFrequency = get_token_frequency(tokens)
        tokens = set(tokens)  # remove duplicate tokens
        HTML_tag_fields = get_HTML_tag_fields(soup)

        for term_position, token in enumerate(tokens):
            # Check whether a PostingList is present in the inverted index;
            # add a new {token: PostingList} to the inverted index otherwise.
            try:
                posting_list = InvertedIndex[stemmer.stem(token)]
                # Check whether a Posting is present in the posting_list;
                # add a new Posting to the posting_list otherwise.
                try:
                    # If Posting 'n' is present in the posting_list,
                    # append the term position to posting.term_positions
                    posting = posting_list[n]
                    posting.append_term_position(term_position)
                except IndexError:
                    # Posting 'n' is not present in the posting_list
                    posting_list.append(
                        Posting(docid=n,
                                tf=tokenFrequency[token],
                                fields=get_posting_fields(HTML_tag_fields, token),
                                termPosition=term_position))
            except KeyError:
                InvertedIndex[stemmer.stem(token)] = PostingList(
                    Posting(docid=n,
                            tf=tokenFrequency[token],
                            fields=get_posting_fields(HTML_tag_fields, token),
                            termPosition=term_position))

    return DocumentIndex, InvertedIndex
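# Hypothetical invocation of BuildInvertedIndex above; the corpus directory
# and glob pattern are placeholders, not from the original code.
import glob

document_paths = glob.glob('corpus/**/*.json', recursive=True)
DocumentIndex, InvertedIndex = BuildInvertedIndex(document_paths)
print(f"Indexed {len(DocumentIndex)} documents, {len(InvertedIndex)} unique stems")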
vectorizer = CountVectorizer(min_df=0)
vectorizer.fit(corpusList)
vocabList = vectorizer.get_feature_names()
# wordFqArr = np.array([word[1] for word in vectorizer.vocabulary_.items()])
# wordFqArr = (wordFqArr - np.mean(wordFqArr) * np.ones(wordFqArr.shape)) / np.std(wordFqArr)
# print(list(vectorizer.vocabulary_.items()))
# print("mean of word frequency:", np.mean(wordFqArr))
# print("standard deviation of word frequency:", np.std(wordFqArr))
# plt.hist(wordFqArr)
# plt.show()

print("getting mean-example list ...")
meanExamList = []
wordMeanExamDict = {}
sb_stemmer = sb.SnowballStemmer("english")
for word in tqdm.tqdm(vocabList):
    if len(re.findall("[0-9]+", word)) > 0:
        continue
    if word not in wordMeanExamDict.keys():
        wordMeanExamDict[word] = TC.main(word.strip())
    stemedWord = sb_stemmer.stem(word)
    if stemedWord not in wordMeanExamDict.keys():
        wordMeanExamDict[stemedWord] = TC.main(stemedWord.strip())

print("saving data ...")
with open("data/GANDict.pkl", "wb+") as GANDictFile:
    pkl.dump(wordMeanExamDict, GANDictFile)

print("loading data ...")
with open("data/GANDict.pkl", "rb") as GANDictFile:
import xgboost as xgb
import cPickle as pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')

stemmer = snowball.SnowballStemmer("english")


def load_data(filename='../labeledhate_5cats.p'):
    '''
    Load data into a data frame for use in running the model
    '''
    return pickle.load(open(filename, 'rb'))


def stem_tokens(tokens, stemmer):
    '''Stem the tokens.'''
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
print("Loading Data") df = load_data() print("Splitting Data") X_train, X_test, y_train, y_test = splitdata(df, classes) #relabel the output for multiclass roc plot, score ylabel_bin = label_binarize(y_test.astype(int), classes=[0, 1, 2, 3, 4], sparse_output=False) ### Loop through # max_features? --> Use 5000 as a starting point, at least for now. ### ### Use english stop words ### Loop Through Vectorizer, stemmer/tokenizing ### vect_options = ['Count', 'Hash', 'Tfidf'] stemmer_options = [snowball.SnowballStemmer("english")] #Note - wordnet.WordNetLemmatizer() has no .stem option & doesn't fit the format of this code. token_options = [None, tokenize] # token_options = [tokenize] for token in token_options: for stemmer in stemmer_options: for vect in vect_options: print('For vect {0}, stemmer {1} & token {2}'.format( vect, stemmer, token)) print('Vectorizing') vectfit_X_train, vectfit_X_test = vectorizer( vectchoice=vect, stopwords='english', tokenize_me=token) print('Classifying') xg_train = xgb.DMatrix(vectfit_X_train, label=y_train)
def train(X_train, s_train, y_train):
    # count_vect = CountVectorizer()
    # X_train_counts = count_vect.fit_transform(twenty_train.data)
    # tfidf_transformer = TfidfTransformer()
    # X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # TODO: do n-grams and add NLTK features!
    # Using default SVM params:
    # penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0,
    # multi_class='ovr', fit_intercept=True, intercept_scaling=1,
    # class_weight=None, verbose=0, random_state=None, max_iter=1000
    stemmer = sb.SnowballStemmer('english')
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += ["'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe',
               'dure', 'might', 'must', "n't", 'need', 'onc', 'onli',
               'ourselv', 'sha', 'themselv', 'veri', 'whi', 'wo', 'would',
               'yourselv']  # complained about not having these as stop words
    pubs = ['buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart',
            'fox', 'guardian', 'review', 'theatlant']
    punct = []  # [':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?', '—', '‘', '$']
    # gonna leave these in for now
    swlist += pubs
    swlist += punct

    if sys.argv[4].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None

    if sys.argv[5].lower() != 'true':
        swlist = []

    if sys.argv[1].lower() == 'rf':
        classTuple = ('rf', RandomForestClassifier(n_estimators=100,
                                                   class_weight='balanced'))
    elif sys.argv[1].lower() == 'svm':
        classTuple = ('svm', LinearSVC(class_weight='balanced'))
    elif sys.argv[1].lower() == 'knn':
        classTuple = ('knn', KNeighborsClassifier(n_neighbors=5,
                                                  metric='cosine'))
    else:
        sys.exit('unknown classifier')

    # What features are we using?
    if sys.argv[7].lower() == 'word':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr)),
                             ('tfidf', TfidfTransformer()),
                             classTuple])
    elif sys.argv[7].lower() == 'topic':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr)),
                             ('tfidf', LatentDirichletAllocation(n_components=50)),
                             classTuple])
    elif sys.argv[7].lower() == 'style':
        text_clf = Pipeline([('vect', RemoveWords()), classTuple])
    elif sys.argv[7].lower() == 'all':
        text_clf = Pipeline([('vect', AugmentedCountVectorizer(stop_words=swlist,
                                                               tokenizer=tkzr,
                                                               useStyleFeatures=True)),
                             ('tfidf', AllFeatureTransformer(n_components=50)),
                             classTuple])
    else:
        sys.exit('unknown features')

    text_clf = text_clf.fit((X_train, s_train), y_train)
    # TODO: save model
    return text_clf
from __future__ import print_function

import numpy as np
import nltk
import nltk.corpus as corpus
import nltk.stem.snowball as snowball
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import re
import random

ptbr_stem = snowball.SnowballStemmer('portuguese').stem


def create_lexicon(dataset):
    global max_doc_length
    lexicon = []
    with open(dataset, 'r') as f:
        contents = f.readlines()[7:]
        for line in contents:
            document = line.split(',')[-1]
            document = re.sub("'", '', document, flags=re.M)
            document = word_tokenize(document)
            for word in document:
                if word.lower() not in corpus.stopwords.words('portuguese'):
                    lexicon.append(word.lower())
def stem_tokenizer(doc):
    tokens = word_tokenize(doc)
    stemmer = snowball.SnowballStemmer("english", ignore_stopwords=True)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return [tok.lower() for tok in stemmed_tokens if tok.isalpha()]
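# Possible way to plug stem_tokenizer into a scikit-learn vectorizer.
# Illustrative only: TfidfVectorizer and the sample sentences are not part of
# the original snippet.
from nltk.tokenize import word_tokenize  # used by stem_tokenizer
from nltk.stem import snowball
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["Wolves were playing on the beaches.",
        "The dog dreamt about playing."]
# lowercase=False because stem_tokenizer already lowercases its output
vectorizer = TfidfVectorizer(tokenizer=stem_tokenizer, lowercase=False)
X = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # use get_feature_names() on older scikit-learn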
@author: Administrator
"""
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves', 'dreamt', 'palying',
         'is', 'beaches', 'envision', 'grounded']

'''Stem extraction'''
stemmer_porter = pt.PorterStemmer()               # relatively lenient
stemmer_lancaster = lc.LancasterStemmer()         # relatively strict
stemmer_snowball = sb.SnowballStemmer('english')  # middle ground

for word in words:
    pstem = stemmer_porter.stem(word)
    lstem = stemmer_lancaster.stem(word)
    sstem = stemmer_snowball.stem(word)
    print('{:10} {:10} {:10} {:10}'.format(word, pstem, lstem, sstem))

# table      tabl       tabl       tabl
# probably   probabl    prob       probabl
# wolves     wolv       wolv       wolv
# dreamt     dreamt     dreamt     dreamt
# palying    pali       paly       pali
# is         is         is         is
# beaches    beach      beach      beach
# envision   envis      envid      envis
# grounded   ground     ground     ground
from extractors.common.extraction_keys import KEYS

"""
Post-processor which tries to find a given place name in a list of manually
fixed place names so that difficult Finnish conjugations or OCR typos can be
resolved to a correct Place name. As a bonus, it fills in the region when it
is recorded in the Place name list. The list itself was generated from a csv
with the karelian-db repository's fix_place_names script.

This processor should be run for Place names which are in conjugated form,
for example birth places, which in the karelian books are usually written in
a form such as "Ahlaisissa". Some conjugations are difficult to handle with a
naive Snowball stemmer, and many OCR typos also seem to trip up the stemmer.
Therefore a list of around 2500 place names was corrected by hand, and the
rest should be possible to merge with the stemmer and a string distance
metric such as Jaro-Winkler.
"""
manually_fixed_place_names_file = open(
    'support_datasheets/place_names_with_alternative_forms.json',
    encoding='utf8')
manually_fixed_place_names = json.load(manually_fixed_place_names_file)
stemmer = snowball.SnowballStemmer('finnish')
manual_place_name_index = {}

"""
Every place name should be found from the existing list of place names when
searched by a conservative Jaro-Winkler distance on the stemmed form of the
name. This should minimize the problem of creating useless unique place names
in the result set just because the same place name has a slight difference at
the end of the word, such as a conjugation.
"""
list_of_known_places_file = open('support_datasheets/place_name_list.csv',
                                 encoding='utf8')
list_of_known_places = list(csv.DictReader(list_of_known_places_file))
place_list_index = collections.OrderedDict()

# Create a hash map whose keys are different writing styles of place names
# and which refer to the correct data entry for those place names
for key, item in manually_fixed_place_names.items():
    manual_place_name_index[key] = item
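# Minimal sketch of the stem + Jaro-Winkler matching idea described above;
# this is NOT the repository's actual implementation. It assumes NLTK >= 3.4
# for jaro_winkler_similarity, and the candidate list below is illustrative.
from nltk.metrics.distance import jaro_winkler_similarity
import nltk.stem.snowball as snowball

fi_stemmer = snowball.SnowballStemmer('finnish')
known_places = ['Ahlainen', 'Antrea', 'Viipuri']  # placeholder candidates


def match_place(conjugated_name, threshold=0.88):
    """Return the best-matching known place for a conjugated or misspelled name."""
    stem = fi_stemmer.stem(conjugated_name.lower())
    best_name, best_score = None, 0.0
    for place in known_places:
        score = jaro_winkler_similarity(stem, fi_stemmer.stem(place.lower()))
        if score > best_score:
            best_name, best_score = place, score
    # A conservative threshold avoids inventing spurious new place names.
    return best_name if best_score >= threshold else None


print(match_place('Ahlaisissa'))  # expected to resolve to 'Ahlainen'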
def __init__(self):
    self.stemmer = sb.SnowballStemmer('english')
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

doc = []
with open('../data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])

tokenizer = tk.RegexpTokenizer(r'\w+')
stopwords = nc.stopwords.words('english')
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)

dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)

n_topics = 2
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
topics = model.print_topics(num_topics=n_topics, num_words=4)
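# Follow-up sketch (not in the original snippet): display the topics computed
# above. gensim's print_topics() returns (topic_id, "weight*term + ...") pairs.
for topic_id, topic_terms in topics:
    print('Topic', topic_id, ':', topic_terms)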
def run_classifer(X_train, y_train, X_test, y_test):
    # s_train = np.array(s_train)  # samples x features
    # s_test = np.array(s_test)
    num_labels = 15
    batch_size = 100

    stemmer = sb.SnowballStemmer('english')
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += ["'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe',
               'dure', 'might', 'must', "n't", 'need', 'onc', 'onli',
               'ourselv', 'sha', 'themselv', 'veri', 'whi', 'wo', 'would',
               'yourselv']  # complained about not having these as stop words
    pubs = ['buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart',
            'fox', 'guardian', 'review', 'theatlant']
    punct = []  # [':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?', '—', '‘', '$']
    # gonna leave these in for now
    swlist += pubs
    swlist += punct

    if sys.argv[3].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None

    if sys.argv[4].lower() != 'true':
        swlist = []

    # What features are we using?
    if sys.argv[6].lower() == 'word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)
    elif sys.argv[6].lower() == 'topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)
    elif sys.argv[6].lower() == 'style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)
    elif sys.argv[6].lower() == 'all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))
        X_train = csr_matrix(sparse.hstack([X_train_tf,
                                            csr_matrix(X_train_lda),
                                            csr_matrix(s_train)]))
        X_test = csr_matrix(sparse.hstack([X_test_tf,
                                           csr_matrix(X_test_lda),
                                           csr_matrix(s_test)]))
        print(type(X_train))
        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)
    else:
        sys.exit('unknown features')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)
    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)

    # load everything back
    # X_train = sparse.load_npz("X_train.npz")

    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    model.model.save(sys.argv[5] + '.h5')
    model = keras.models.load_model(sys.argv[5] + '.h5')

    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])

    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    # predicted = np.argmax(pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(y_test, y_pred)
    print(p, r, fs, s)
def __init__(self):
    self.stemmer = snowball.SnowballStemmer("english")
# @FileName: myNLTKStem.py
# @Software: PyCharm Community Edition
# @introduction: word stem extraction
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves', 'playing', 'is',
         'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']

# Stem extraction
print('=======PorterStemmer=========')
stemmer = pt.PorterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)

print('=======LancasterStemmer=========')
stemmer = lc.LancasterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)

print('=======SnowballStemmer=========')
stemmer = sb.SnowballStemmer('english')  # SnowballStemmer requires a language argument
for word in words:
    stem = stemmer.stem(word)
    print(stem)