def dutch():
    # Download the NLTK Alpino corpus, then train and pickle a Dutch tagger.
    from collective.classification.data.downloader import downloadNLTKAlpinoCorpus
    downloadNLTKAlpinoCorpus()

    from nltk.corpus import alpino
    alpino_sents = alpino.tagged_sents(simplify_tags=True)
    tagger = BrillTrigramTagger()
    tagger.train(alpino_sents)
    dump(tagger.tagger, "dutch_tagger.pickle")
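# Illustrative only: a minimal sketch of how the pickled tagger written above
# could be loaded back and applied. It assumes dump() above wrote a plain
# pickle (if it came from joblib, use joblib.load instead); the helper name and
# the sample sentence are made up for demonstration.
import pickle

def load_dutch_tagger(path="dutch_tagger.pickle"):
    with open(path, "rb") as handle:
        return pickle.load(handle)

# tagger = load_dutch_tagger()
# print(tagger.tag("Dit is een voorbeeldzin .".split()))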
def train_corpus_to_tag():
    """
    Train a tagger on the Alpino corpus.

    :return: trained PerceptronTagger model
    """
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)
    return tagger
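# Illustrative only: a hedged usage sketch for train_corpus_to_tag() above.
# The imports and the sample Dutch sentence are assumptions added for
# demonstration; training the perceptron on the full Alpino corpus may take
# several minutes.
from nltk.corpus import alpino as alp
from nltk.tag import PerceptronTagger

if __name__ == "__main__":
    dutch_tagger = train_corpus_to_tag()
    print(dutch_tagger.tag("Een telescoop is een instrument .".split()))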
def transform(self, reviews, y=None):
    # Count adjectives per review using a unigram tagger trained on Alpino.
    number_of_adjectives = []
    training_corpus = alp.tagged_sents()
    unitagger = UnigramTagger(training_corpus)
    pos_tag = unitagger.tag
    for review in reviews:
        tokens = re.findall(r"[\w']+|[.,!?;]", review)
        adj = 0
        for token in pos_tag(tokens):
            if token[1] == 'adj':
                adj += 1
        number_of_adjectives.append([adj])
    return number_of_adjectives
def select_sents(x):
    return {
        'brown_universal': brown.tagged_sents(tagset='universal'),  # Accuracy: 95.12%
        'brown': brown.tagged_sents(),  # Accuracy: 93.66%
        'conll2000_universal': conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': conll2000.tagged_sents(),  # Accuracy: 94.94%
        'conll2002': conll2002.tagged_sents(),  # Accuracy: 91.53%
        'alpino': alpino.tagged_sents(),  # Accuracy: 88.79%
        'dependency_treebank': dependency_treebank.tagged_sents(),  # Accuracy: 90.79%
        'treebank': treebank.tagged_sents(),  # Accuracy: 91.44%
        'indian': indian.tagged_sents(),  # Accuracy: 64.41%
        'else': []  # in case of an unavailable corpus
    }.get(x, 'else')
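# Illustrative only: one way the per-corpus accuracy figures in the comments
# above could be reproduced. The 90/10 split, the UnigramTagger choice and the
# helper name are assumptions for demonstration, not taken from the original code.
from nltk.tag import UnigramTagger

def evaluate_tagger_on(corpus_key):
    sents = select_sents(corpus_key)
    cutoff = int(len(sents) * 0.9)
    train_sents, test_sents = sents[:cutoff], sents[cutoff:]
    tagger = UnigramTagger(train_sents)
    # NLTK >= 3.6 prefers tagger.accuracy(test_sents); evaluate() is the older name.
    return tagger.evaluate(test_sents)

# print(evaluate_tagger_on('alpino'))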
def tagger(self): """ Usage: training_corpus = list(alp.tagged_sents()) tagger = PerceptronTagger(load=True) tagger.train(training_corpus) #sent = 'NLTK is een goeda taal voor het leren over NLP'.split() print(tagger.tag(article_text.split())) :return: """ # Load Corpus training_corpus = list(alp.tagged_sents()) tagger = PerceptronTagger(load=True) # Build tagger tagger.train(training_corpus) return tagger.tag(self.string.split())
def main(file_input):
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')
    print("Loaded .csv file Successfully")

    print("Missing Value Treatment : Start")
    # Missing value treatment: repeatedly take the column with the most NaNs
    # and drop the rows that are NaN in that column.
    while test_data.isnull().sum().values.sum() != 0:
        col_with_missing_val = (test_data.isnull().sum()).argmax()
        test_data = test_data[test_data[col_with_missing_val].notnull()]
        print(col_with_missing_val)
    print("Missing Value Treatment : Stop")

    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])

    print("Computing Pattern Transformers: Start")
    # Pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub(r"\W", "", x)))
    test_data["Number_of_Seprators"] = test_data['candidate'].apply(
        lambda x: len(re.sub(r"\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))
    print("Computing Pattern Transformers: Stop")

    print("Computing Context Transformers: Start")
    # Context transformers
    test_data["Text"] = (test_data["line_before"] + test_data["line_at"]
                         + test_data["line_after"])

    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        return 1 if match is not None else 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub(r"[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # 1 where an email address is found, else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop-word check, size filter, stemming (Dutch)
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub(r"[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub(r"\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(clean_data)  # tokenize, stem and lemmatize

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        # Drop tokens whose first or last three characters repeat (e.g. 'iiiiiii'),
        # then count adverbs and adjectives.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)
    print("Computing Context Transformers: Stop")

    # Load the vocabulary
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(test_data.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")

    print("Computing Location Transformers: Start")
    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")

    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")

    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ], axis=1)
    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
def main():
    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)

    num = 2138
    dic = {}
    Xtrain = []
    Ytrain = []
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                Ytrain.append(line.split()[3][8])
                string = [line.split('"')[1]]
                dic[line.split('"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)

    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string = []
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)

    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])

    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))

    # Embed each document with spaCy's Dutch model and cluster with k-means.
    nlp = spacy.load('nl_core_news_sm')
    veclist = []
    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector
        veclist.append(vec)
    X = np.array(veclist)

    clf = KMeans(n_clusters=2,
                 init='k-means++',
                 n_init=10,
                 max_iter=300,
                 tol=0.0001,
                 precompute_distances='auto',
                 verbose=0,
                 random_state=None,
                 copy_x=True,
                 n_jobs=None)
    labels = clf.fit_predict(X)

    # Project to 2D with PCA for plotting.
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)

    lst = []
    for index, sentence in enumerate(sentences):
        plt.text(coords[index].tolist()[0],
                 coords[index].tolist()[1],
                 str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10],
                 fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))

    label_colors = ["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)

    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0],
                centroid_coords[:, 1],
                marker="X",
                s=200,
                linewidth=2,
                c="#444d61")

    print(Counter(labels))
    genders = []
    for i, j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    print(Counter(lst))
    plt.show()
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True),
'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True),
'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True),
'Spanish: CESS-ESP Corpus (simplified)':
import numpy as np
import torch
from torch.autograd import Variable
import pickle
from collections import Counter
from torch import nn
import torch.nn.functional as F
from nltk.tag import PerceptronTagger
from nltk.corpus import alpino as alp
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer

training_corpus = list(alp.tagged_sents())
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)

wordTokenizer = WordPunctTokenizer()
sentTokenizer = PunktSentenceTokenizer()


def generate_vocabulary(data, vocabulary_size):
    all_data = " ".join(data)
    print(all_data[:100])
    words = [
        word for sent in sentTokenizer.tokenize(all_data)
        for word in wordTokenizer.tokenize(sent)
    ]
    counter = Counter(words)
    # most_common(k) returns the k most frequently encountered
    # words together with their respective counts.
    most_common = counter.most_common(vocabulary_size)
    vocabulary = set([word for word, count in most_common])
def main(file_input):
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)
    print("Loaded .csv file Successfully")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing values: repeatedly take the column with the most missing values
    # and drop the rows that are NaN in that column.
    def missing_value(data_df):
        while data_df.isnull().sum().values.sum() != 0:
            col_with_missing_val = (data_df.isnull().sum()).argmax()
            data_df = data_df[data_df[col_with_missing_val].notnull()]
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    # Missing value treatment
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Pattern matchers for the candidate feature.
    # Newly added features: date format, currency format, number of digits per
    # candidate, number of separators per candidate.
    print("Computing Pattern Transformers: Start")
    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = r"^\d*[\/.,@$!)(]$"  # only digits + ends with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = r'^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'
    pattern_currency_1 = r'^[0-9]\.[0-9]+\,[0-9]*$'  # captures amounts like d.ddd,dd
    pattern_currency_2 = r'^[0-9]+\,[0-9]+$'

    data_df['currency_filter'] = (
        data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64)
        | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64))
    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub(r"\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub(r"\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))
    # includes the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # Drop blacklisted candidates
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)
    print("Computing Pattern Transformers: Stop")

    # NLP techniques: tokenization, stemming, lemmatization, frequency
    # distribution, bag-of-words approach.
    # Combine the three text columns into a single column holding the full text.
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df["line_after"]

    print("Computing Context Transformers: Start")

    # Context transformers
    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        return 1 if match is not None else 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(
        lambda x: len((re.sub(r"[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # 1 where an email address is found, else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean data - tokenization, stop-word check, size filter, stemming (Dutch)
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words('dutch', 'french')))  # stopwords to ignore
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub(r"[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub(r"\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(clean_data)  # tokenize, stem and lemmatize
    print("Cleaning Text Data: Stop")

    print("Computing POS Vectors: Start")
    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        # Drop tokens whose first or last three characters repeat (e.g. 'iiiiiii'),
        # then count adverbs and adjectives.
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        for tup in tags:
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
                tags.remove(tup)
            if len(tup[0]) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)
    print("Computing POS Vectors: Stop")

    print("Computing Vocabulary: Start")
    # Store all words of the positive class in one list.
    docs_pos = []
    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])
    docs_pos = list(itertools.chain(*docs_pos))

    # Clean text data - remove words like iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    for i in docs_pos:
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)
    print("Positive class words are stored successfully")

    all_words_pos = nltk.FreqDist(docs_pos)
    print("Computing vocabulary based on Positive Class")

    # Find popular words; popular means occurring at least 25 times in the corpus.
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])

    # Keep only the nouns among the popular positive-class words.
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))

    # Save vocabulary
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)
    print("Computing Vocabulary: Stop")
    print("Length of Vocabulary: ", len(vocabulary))

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)
    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)), dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:, col_index].reshape(data_df.shape[0], 1)
    print("Computing Bag of Words Vectors: Stop")
    print("Computing Context Transformers: Stop")

    print("Computing Location Transformers: Start")
    data_df["location_page_nr"] = data_df["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    print("Computing Location Transformers: Stop")

    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)

    print("Building ML - Neural Network Model: Start")
    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ], axis=1)
    y = data_df.gold

    # Normalisation
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))
        model.add(Dense(512))
        model.add(Activation('sigmoid'))
        model.add(Dense(128))
        model.add(Activation('sigmoid'))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    # Stratified k-fold cross-validation
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train,
                            y_train,
                            epochs=5,
                            batch_size=1024,
                            verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])
        split += 1
        del model, history, results

    # Retrain on the full data set and persist the model.
    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)
    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')

    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1
    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1),
                         decimals=5)

    print("Storing Results in .csv file")
    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
    return np.mean(scores), br_score
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='simple'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='simple'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'),
'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'),
'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'),
'Spanish: CESS-ESP Corpus (simplified)':
'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='universal'),
'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'),
'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'),
'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'),
'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'),
'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'),
'Spanish: CESS-ESP Corpus (simplified)':
from nltk.corpus import brown, conll2000, alpino, floresta, gutenberg
from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter
from HMM import *

# Load the training and test sentences
print("Downloading Training Sentences from Corpus")
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
print("Done!")

print("Downloading Test Sentences from Corpus")
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print("Done!")


# Extract words and tags from sentences
def extractWords_and_Tags(sentences):
    words = {}
    tags = {}
def __init__(self):
    super(CountAdjectives, self).__init__()
    self.tagger = PerceptronTagger(load=True)
    training_corpus = list(alpino.tagged_sents())
    self.tagger.train(training_corpus)
import nltk
from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

alpino = alp.tagged_sents()
unitagger = UnigramTagger(alpino)
bitagger = BigramTagger(alpino, backoff=unitagger)
pos_tag = bitagger.tag

sent = 'Een telescoop is een instrument dat een astronoom gebruikt .'.split()
print(pos_tag(sent))
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents( categories="humor", tagset="universal" ), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts( tagset="universal" ), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( tagset="universal" ), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( tagset="universal" ),
categories=["news", "editorial", "reviews"], tagset="simple" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="simple" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results
def train_dutch_tagger():
    # Train a bigram tagger on Alpino with a unigram backoff and return its tag method.
    training_corpus = alp.tagged_sents()
    unitagger = UnigramTagger(training_corpus)
    bitagger = BigramTagger(training_corpus, backoff=unitagger)
    pos_tag = bitagger.tag
    return pos_tag
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="universal"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="universal"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="universal"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="universal"), "Spanish: CESS-ESP Corpus (simplified)":
from flask import Flask, render_template, request
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from logger import getlogger
import nltk
import sys
import timeit
import urllib.request
import json
import settings
import requests
from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

# Train a Dutch POS tagger on Alpino: bigram tagger with unigram backoff.
training_corpus = alp.tagged_sents()
unitagger = UnigramTagger(training_corpus)
bitagger = BigramTagger(training_corpus, backoff=unitagger)
pos_tag = bitagger.tag

logger = getlogger(__name__)

app = Flask(__name__,
            template_folder='html/templates',
            static_folder='html/static')
CORS(app)
socketio = SocketIO(app)
app.debug = False


@app.route('/')
def index():
    return render_template('test4nl.html')


def query_pixabay(nouns):