def dutch():
    from collective.classification.data.downloader import\
        downloadNLTKAlpinoCorpus
    downloadNLTKAlpinoCorpus()
    from nltk.corpus import alpino
    alpino_sents = alpino.tagged_sents(simplify_tags=True)
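    # simplify_tags=True is the pre-NLTK-3 way of collapsing Alpino's detailed
    # tags into a small simplified set; NLTK 3 replaced it with tagset='universal'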
    tagger = BrillTrigramTagger()
    tagger.train(alpino_sents)
    dump(tagger.tagger, "dutch_tagger.pickle")
Example #2
 def train_corpus_to_tag():
     """
     Train tagger on Alpino Corpus
     :return: model tagger  <type: 'model'>
     """
     alp_tagged_sent = list(alp.tagged_sents())
     tagger = PerceptronTagger(load=False)
     tagger.train(alp_tagged_sent)
     return tagger
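A minimal usage sketch for the tagger returned above (an assumption: `train_corpus_to_tag` and the `alp`/`PerceptronTagger` imports it relies on are available at module level; the Dutch sentence is just a made-up input):

tagger = train_corpus_to_tag()
print(tagger.tag('Een telescoop is een instrument .'.split()))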
Example #4
 def transform(self, reviews, y=None):
     number_of_adjectives = []
     training_corpus = alp.tagged_sents()
     unitagger = UnigramTagger(training_corpus)
     pos_tag = unitagger.tag
     for review in reviews:
         tokens = re.findall(r"[\w']+|[.,!?;]", review)
         adj = 0
         for token in pos_tag(tokens):
             if token[1] == 'adj':
                 adj += 1
         number_of_adjectives.append([adj])
     return number_of_adjectives
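For reference, a self-contained sketch of the same idea outside the transformer class, counting Alpino-tagged adjectives per review with a plain UnigramTagger (the function name and the sample review are illustrative, not taken from the project above):

import re

from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger

unitagger = UnigramTagger(list(alp.tagged_sents()))  # train once, reuse per review


def count_adjectives(review):
    tokens = re.findall(r"[\w']+|[.,!?;]", review)
    # Alpino uses the simplified tag 'adj' for adjectives; unknown tokens get tag None
    return sum(1 for _, tag in unitagger.tag(tokens) if tag == 'adj')


print(count_adjectives('Een mooie , heldere telescoop .'))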
Example #5
def select_sents(x):
    return {
        'brown_universal':
        brown.tagged_sents(tagset='universal'),  # Accuracy: 95.12%
        'brown': brown.tagged_sents(),  # Accuracy: 93.66%
        'conll2000_universal':
        conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': conll2000.tagged_sents(),  # Accuracy: 94.94%
        'conll2002': conll2002.tagged_sents(),  # Accuracy: 91.53%
        'alpino': alpino.tagged_sents(),  # Accuracy: 88.79%
        'dependency_treebank':
        dependency_treebank.tagged_sents(),  # Accuracy: 90.79%
        'treebank': treebank.tagged_sents(),  # Accuracy: 91.44%
        'indian': indian.tagged_sents(),  # Accuracy: 64.41%
        'else': []  # in case of an unavailable corpus
    }.get(x, 'else')
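A sketch of how such a selector is typically consumed: pick a corpus, split it, and score an NLTK tagger on the held-out part (assuming `select_sents` and its corpus imports are available; the 90/10 split and the UnigramTagger choice are assumptions, not taken from the project above):

from nltk.tag import UnigramTagger

sents = list(select_sents('alpino'))
cut = int(len(sents) * 0.9)
tagger = UnigramTagger(sents[:cut])
print(tagger.evaluate(sents[cut:]))  # accuracy on the held-out 10%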
Example #6
    def tagger(self):
        """
        Usage:

        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)

        tagger.train(training_corpus)

        #sent = 'NLTK is een goeda taal voor het leren over NLP'.split()

        print(tagger.tag(article_text.split()))
        :return:
        """

        # Load Corpus
        training_corpus = list(alp.tagged_sents())
        tagger = PerceptronTagger(load=True)

        # Build tagger
        tagger.train(training_corpus)

        return tagger.tag(self.string.split())
Example #7
def main(file_input):
    test_data = pd.read_csv(str(file_input) + '.csv')
    # test_data = pd.read_csv(str(file_input) + '.csv', index_col='Unnamed: 0')

    print("Loaded .csv file Successfully")

    print("Missing Value Treatment : Start")
    # missing values Treatment
    while test_data.isnull().sum().values.sum() != 0:
        col_with_missing_val = (test_data.isnull().sum()).argmax()
        test_data = test_data[test_data[col_with_missing_val].notnull(
        )]  # drop corresponding rows that has NaN values
        print(col_with_missing_val)

    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", test_data.shape[0])
    print("Total Number of Features:", test_data.shape[1])

    print("Computing Pattern Transformers: Start")
    # pattern transformers
    pattern_strictlyDigits = "^[0-9]*$"
    test_data["strictly_Digits"] = test_data["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    test_data["Number_of_Digits"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    test_data["Number_of_Seprators"] = test_data['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    test_data["Length_of_Candidate"] = test_data['candidate'].apply(
        lambda x: len(x))

    print("Computing Pattern Transformers: Stop")
    print("Computing Context Transformers: Start")
    # context transformers
    test_data["Text"] = test_data["line_before"] + test_data[
        "line_at"] + test_data["line_after"]

    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    test_data["Number_of_Characters_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    test_data["Number_of_Digits_Text"] = test_data["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    test_data["Number_of_Separators_Text"] = test_data["Text"].apply(
        lambda x: len((re.sub("[\w]+", "", str(x))).replace(" ", "")))
    test_data["Email_Exists"] = test_data["Text"].apply(
        email_match)  # place 1 everywhere email found else 0
    test_data["Number_of_spaces"] = test_data["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        doc = re.sub(
            "[\w\.-]+@[\w\.-]+", " ", str(doc)
        )  # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    test_data["Text"] = test_data["Text"].apply(
        clean_data)  # tokenize, stem and lammetize

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop noise tokens (e.g. 'iiiiiii', 'abvwwwwwcgdccc') whose first or
        # last three characters are identical; iterate over a copy so that
        # removing items does not skip elements
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        # same noise filtering as in count_adj above
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        # same noise filtering as in count_adj above
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    test_data["Adv_Adj_Count"] = test_data["Text"].apply(count_adj)
    test_data["NN_count"] = test_data["Text"].apply(count_nn)
    test_data["Verb_count"] = test_data["Text"].apply(count_verb)

    print("Computing Context Transformers: Stop")
    # load the vocabulary
    with open("vocab.txt", "rb") as fp:
        vocabulary = pickle.load(fp)

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for i, word in enumerate(vocabulary):
                if word == w:
                    vector[0][i] += 1
        return vector

    bag_vectors = test_data["Text"].apply(build_features)
    feature_vectors = np.zeros((test_data.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(test_data.index.values):
        feature_vectors[pos, :] = bag_vectors[index]
    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        test_data[col] = feature_vectors[:, col_index].reshape(
            test_data.shape[0], 1)

    print("Computing Bag of Words Vectors: Stop")

    print("Computing Location Transformers: Start")

    test_data["location_page_nr"] = test_data["page_nr"].apply(
        lambda x: 100 if x >= 50 else x)
    test_data["location_line_nr"] = test_data["line_nr"].apply(
        lambda x: 100 if x >= 50 else x)

    print("Computing Location Transformers: Stop")

    print("Loading Model...")
    model = tf.keras.models.load_model('model_candidate_filter.h5')
    model.compile(loss=tf.keras.losses.mean_squared_error,
                  optimizer='adam',
                  metrics=['accuracy'])
    print("Loaded Model Successfully!")

    X_test = test_data.drop([
        "candidate", "Text", "label", "line_after", "line_at", "line_before",
        "page_nr", "line_nr"
    ],
                            axis=1)

    X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)
    yHat_proba = model.predict(X_test)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    print("Storing Results in .csv file")

    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=test_data.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_run.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)
Example #8
def main():

    training_corpus = list(alp.tagged_sents())
    global tagger
    tagger = PerceptronTagger()
    tagger.train(training_corpus)
    num = 2138
    dic = {}

    Xtrain = []
    Ytrain = []
    with open("trainGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                Ytrain.append(line.split()[3][8])
                string=[line.split('\"')[1]]
                dic[line.split('\"')[1]] = line.split()[3][8]
            elif line[0:6] == "</doc>":
                Xtrain.append(" ".join(string))
            else:
                string.append(line)

    Xtest = []
    with open("testGxG/GxG_News.txt") as txt:
        for line in txt:
            if line[0:8] == "<doc id=":
                string=[]
            elif "</doc>" in line:
                Xtest.append(" ".join(string))
            else:
                string.append(line)

    Ytest = []
    with open("testGxG/GxG_News_gold.txt") as text:
        for line in text:
            Ytest.append(line.split()[1])

    sentences = []
    for i in Xtrain[:num]:
        sentences.append(preprocess(i))


    nlp = spacy.load('nl_core_news_sm')
    veclist = []

    for sentence in sentences:
        doc = nlp(sentence)
        vec = doc.vector 
        veclist.append(vec)

    X = np.array(veclist)

    clf = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=None)
    labels = clf.fit_predict(X)
    pca = PCA(n_components=2).fit(X)
    coords = pca.transform(X)

    lst = []

    for index, sentence in enumerate(sentences):
        plt.text(coords[index].tolist()[0],coords[index].tolist()[1], str(dic[sentence.split()[0]]) + str(labels[index]) + ":" + str(sentence)[0:10], fontsize=4)
        lst.append(str(dic[sentence.split()[0]]) + str(labels[index]))

    label_colors=["red", "blue", "green", "yellow", "black", "purple", "cyan"]
    colors = [label_colors[i] for i in labels]
    plt.scatter(coords[:, 0], coords[:, 1], c=colors)
    centroids = clf.cluster_centers_
    centroid_coords = pca.transform(centroids)
    plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker="X", s=200, linewidth=2, c="#444d61")

    print(Counter(labels))

    genders = []
    for i,j in enumerate(sentences):
        if i < num:
            genders.append(dic[j.split()[0]])
    print(Counter(genders))
    print(Counter(lst))
    plt.show()
Example #9
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(simplify_tags=True),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(simplify_tags=True),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #10
import numpy as np
import torch
from torch.autograd import Variable
import pickle
from collections import Counter
from torch import nn
import torch.nn.functional as F
from nltk.tag import PerceptronTagger
from nltk.corpus import alpino as alp
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
training_corpus = list(alp.tagged_sents())
tagger = PerceptronTagger(load=True)
tagger.train(training_corpus)
wordTokenizer = WordPunctTokenizer()
sentTokenizer = PunktSentenceTokenizer()


def generate_vocabulary(data, vocabulary_size):
    all_data = " ".join(data)
    print(all_data[:100])
    words = [
        word for sent in sentTokenizer.tokenize(all_data)
        for word in wordTokenizer.tokenize(sent)
    ]
    counter = Counter(words)

    # most_common() produces k frequently encountered
    # input values and their respective counts.
    most_common = counter.most_common(vocabulary_size)
    vocabulary = set([word for word, count in most_common])
Example #11
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
 lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
 lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
 lambda: floresta.tagged_sents(simplify_tags=True),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
 lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
 lambda: mac_morpho.tagged_sents(simplify_tags=True),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #12
File: train.py  Project: u4ece10128/Tasks
def main(file_input):
    data_df = pd.read_csv(str(file_input) + '.csv')
    data_df = shuffle(data_df)

    print("Loaded .csv file Successfully")

    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # Missing Values
    # column with maximum missing values

    def missing_value(data_df):
        while data_df.isnull().sum().values.sum() != 0:
            col_with_missing_val = (data_df.isnull().sum()).argmax()
            data_df = data_df[data_df[col_with_missing_val].notnull(
            )]  # drop corresponding rows that has NaN values
            print("Missing Values in Features:", col_with_missing_val)
        return data_df

    #  Missing Value Treatment:
    print("Missing Value Treatment : Start")
    data_df = missing_value(data_df)
    print("Missing Value Treatment : Stop")
    print("Total Number of Samples:", data_df.shape[0])
    print("Total Number of Features:", data_df.shape[1])

    # pattern matcher for candidate feature
    #  newly Added Features : Dates format, currency format, number of digits per candidate, number of separators
    # per candidate
    print("Computing Pattern Transformers: Start")

    pattern_strictlyDigits = "^[0-9]*$"
    pattern_endWithCharacters = "^\d*[\/.,@$!)(]$"  # Only digits + end with special characters
    pattern_telephone = "^0[0-9]{12}$"
    pattern_vat = "^0?[0-9]{9}$"
    pattern_date = '^[0-3]?[0-9](\/|\,|\.|\-){1}[0-9]?[0-9](\/|\,|\.|\-){1}[0-2][0-9]{1,3}$'

    pattern_currency_1 = '^[0-9]\.[0-9]+\,[0-9]*$'  # captures ddddd,dddd
    pattern_currency_2 = '^[0-9]+\,[0-9]+$'
    data_df['currency_filter'] = data_df['candidate'].str.contains(pattern_currency_1, regex=True).astype(np.int64)\
                                 | data_df['candidate'].str.contains(pattern_currency_2, regex=True).astype(np.int64)

    data_df['dates_filter'] = data_df['candidate'].str.contains(
        pattern_date, regex=True).astype(np.int64)
    data_df["Is_strictly_Digits"] = data_df["candidate"].str.contains(
        pattern_strictlyDigits, regex=True).astype(np.int64)
    data_df["endWithCharacters"] = data_df["candidate"].str.contains(
        pattern_endWithCharacters, regex=True).astype(np.int64)
    data_df["Number_of_Digits"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\W", "", x)))
    data_df["Number_of_Separators"] = data_df['candidate'].apply(
        lambda x: len(re.sub("\w", "", x)))
    data_df["Length_of_Candidate"] = data_df['candidate'].apply(
        lambda x: len(x))

    # included the country code
    data_df["Telephone"] = data_df["candidate"].str.contains(
        pattern_telephone, regex=True).astype(np.int64)
    # VAT number contains 9 to 10 digits
    data_df["VATNumber"] = data_df["candidate"].str.contains(
        pattern_vat, regex=True).astype(np.int64)

    # drop blacklisted variables
    dates_index = data_df.index[data_df['dates_filter'] == 1].tolist()
    data_df = data_df.drop(index=dates_index, axis=0)
    data_df = data_df.drop("dates_filter", axis=1)
    currency_index = data_df.index[data_df['currency_filter'] == 1].tolist()
    data_df = data_df.drop(index=currency_index, axis=0)
    data_df = data_df.drop(["currency_filter"], axis=1)
    telephone_index = data_df.index[data_df['Telephone'] == 1].tolist()
    data_df = data_df.drop(index=telephone_index, axis=0)
    data_df = data_df.drop(["Telephone"], axis=1)
    vat_index = data_df.index[data_df['VATNumber'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["VATNumber"], axis=1)
    vat_index = data_df.index[data_df['endWithCharacters'] == 1].tolist()
    data_df = data_df.drop(index=vat_index, axis=0)
    data_df = data_df.drop(["endWithCharacters"], axis=1)

    print("Computing Pattern Transformers: Stop")

    # NLP Techniques:
    # Tokenization, Stemming, lemmatization, Frequency Distribution, Bag of words approach

    # Combine three text columns to single column - This columns contains he full text
    data_df["Text"] = data_df["line_before"] + data_df["line_at"] + data_df[
        "line_after"]

    print("Computing Context Transformers: Start")

    # Context Transformers
    def email_match(doc):
        match = re.search(r'[\w\.-]+@[\w\.-]+', str(doc))
        if match != None:
            return 1
        else:
            return 0

    data_df["Number_of_Characters_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^a-z]", "", str(x))))
    data_df["Number_of_Digits_Text"] = data_df["Text"].apply(
        lambda x: len(re.sub("[^0-9]+", "", str(x))))
    data_df["Number_of_Separators_Text"] = data_df["Text"].apply(lambda x: len(
        (re.sub("[\w]+", "", str(x))).replace(" ", "")))
    data_df["Is_Email_Exists"] = data_df["Text"].apply(
        email_match)  # place 1 everywhere email found else 0
    data_df["Number_of_spaces"] = data_df["Text"].apply(
        lambda x: str(x).count(' '))  # counts number of spaces,

    # Clean Data - Tokenization, Stop word check, Size filter, Stemming - Dutch Language
    ss = SnowballStemmer("dutch", "french")

    def clean_data(doc):
        ignore = list(set(stopwords.words(
            'dutch', 'french')))  # ignore the list of stopwords
        exl_chars = list(set(string.punctuation))
        exl_chars.append('€')
        # remove email ids to avoid conflicts in vocabulary construction
        doc = re.sub("[\w\.-]+@[\w\.-]+", " ", str(doc))
        doc = re.sub("\d", " ", str(doc))
        doc = ''.join([ch for ch in doc if ch not in exl_chars])
        words = []
        for i in word_tokenize(doc):  # tokenization
            if i not in ignore:
                if len(i) >= 2:  # standalone letters do not add any value
                    i = ss.stem(i)
                    words.append(i)
        doc = ' '.join(list(set(words)))
        return doc

    print("Cleaning Text Data: Start")
    data_df["Text"] = data_df["Text"].apply(
        clean_data)  # tokenize, stem and lammetize
    print("Cleaning Text Data: Stop")

    print("Computing POS Vectors: Start")

    # training_corpus = alp.tagged_sents()
    alp_tagged_sent = list(alp.tagged_sents())
    tagger = PerceptronTagger(load=False)
    tagger.train(alp_tagged_sent)

    def count_adj(doc):
        tags = tagger.tag(doc.split())
        # drop noise tokens (e.g. 'iiiiiii', 'abvwwwwwcgdccc') whose first or
        # last three characters are identical; iterate over a copy so that
        # removing items does not skip elements
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_adj_adv = counts['adv'] + counts['adj']
        return count_adj_adv

    def count_nn(doc):
        tags = tagger.tag(doc.split())
        # same noise filtering as in count_adj above
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_nn = counts['noun']
        return count_nn

    def count_verb(doc):
        tags = tagger.tag(doc.split())
        # same noise filtering as in count_adj above
        for tup in list(tags):
            first_3_characters = tup[0][:3]
            last_3_characters = tup[0][-3:]
            if len(tup[0]) >= 3 and (
                    first_3_characters[0] == first_3_characters[1] ==
                    first_3_characters[2]
                    or last_3_characters[0] == last_3_characters[1] ==
                    last_3_characters[2]):
                tags.remove(tup)
        counts = Counter(tag for word, tag in tags)
        count_verb = counts['verb']
        return count_verb

    data_df["Adv_Adj_Count"] = data_df["Text"].apply(count_adj)
    data_df["NN_count"] = data_df["Text"].apply(count_nn)
    data_df["Verb_count"] = data_df["Text"].apply(count_verb)

    print("Computing POS Vectors: Stop")

    print("Computing Vocabulary: Start")

    # store all the words in positive class and negative in two separate lists
    docs_pos = []

    docs_pos.extend(
        word_tokenize(words) for words in data_df.Text[data_df.gold == 1])

    docs_pos = list(itertools.chain(*docs_pos))

    # Clean text data - remove words like --- iiiiiii, hhhhhccchhhh, abvwwwwwcgdccc
    for i in list(docs_pos):  # iterate over a copy so removals do not skip items
        first_3_characters = i[:3]
        last_3_characters = i[-3:]
        if len(i) >= 3 and first_3_characters[0] == first_3_characters[
                1] == first_3_characters[2]:
            docs_pos.remove(i)
        if i in docs_pos and len(i) >= 3 and last_3_characters[
                0] == last_3_characters[1] == last_3_characters[2]:
            docs_pos.remove(i)

    print("Positve class words are stored successfully")

    all_words_pos = nltk.FreqDist(docs_pos)

    print("Computing vocabulary based on Positive Class")
    # find popular words, popular equals more than 25 times in the corpus
    popular_pos_words = []
    for i in all_words_pos.items():
        if i[1] >= 25:
            popular_pos_words.append(i[0])

    # Filter nouns from the popular positive class words
    tagged_pos_words = tagger.tag(popular_pos_words)
    filtered_tag_pos_words_nouns = []
    for word in tagged_pos_words:
        if word[1] == 'noun':
            filtered_tag_pos_words_nouns.append(word[0])
    vocab_pos = list(set(filtered_tag_pos_words_nouns))
    vocabulary = list(set(vocab_pos))

    # save vocabulary
    with open("vocab.txt", "wb") as fp:
        pickle.dump(vocabulary, fp)

    print("Computing Vocabulary: Stop")

    print("Length of Vocabulary: ", len(vocabulary))

    print("Computing Bag of Words Vectors: Start")

    def build_features(doc):
        vector = np.zeros((1, len(vocabulary)), dtype=np.int64)
        for w in word_tokenize(doc):
            for idx, vocab in enumerate(vocabulary):
                if vocab == w:
                    vector[0][idx] += 1
        return vector

    bag_vectors = data_df["Text"].apply(build_features)

    feature_vectors = np.zeros((data_df.shape[0], len(vocabulary)),
                               dtype=np.int64)
    for pos, index in enumerate(data_df.index.values):
        feature_vectors[pos, :] = bag_vectors[index]

    cols = ["BOW_" + str(col) for col in range(0, len(vocabulary))]
    for col_index, col in enumerate(cols):
        data_df[col] = feature_vectors[:,
                                       col_index].reshape(data_df.shape[0], 1)

    print("Computing Bag of Words Vectors: Stop")

    print("Computing Context Transformers: Stop")

    print("Computing Location Transformers: Start")

    data_df["location_page_nr"] = data_df["page_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)
    data_df["location_line_nr"] = data_df["line_nr"].apply(lambda x: 100
                                                           if x >= 50 else x)

    print("Computing Location Transformers: Stop")

    print("Total Number of Newly Added Features:", data_df.shape[1] - 7)

    print("Building ML - Neural Network Model: Start")

    X = data_df.drop([
        "candidate", "Text", "gold", "label", "line_after", "line_at",
        "line_before", "line_nr", "page_nr"
    ],
                     axis=1)
    y = data_df.gold
    #  Normalisation
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    def build_model(input_shape):
        model = Sequential()
        model.add(Dense(1024, input_shape=(input_shape, )))
        model.add(Activation('sigmoid'))

        model.add(Dense(512))
        model.add(Activation('sigmoid'))

        model.add(Dense(128))
        model.add(Activation('sigmoid'))

        model.add(Dense(1, activation="sigmoid"))

        model.compile(optimizer='adam',
                      loss=tf.keras.losses.mean_squared_error,
                      metrics=['accuracy'])
        return model

    #  Stratified k-Fold
    k_fold_outer = model_selection.StratifiedKFold(n_splits=5)
    scores = []
    split = 0
    for train_index, test_index in k_fold_outer.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        model = build_model(X_train.shape[1])
        history = model.fit(X_train,
                            y_train,
                            epochs=5,
                            batch_size=1024,
                            verbose=1)
        results = model.evaluate(X_val, y_val)
        scores.append(results[1])
        split += 1
        del model, history, results

    model = build_model(X.shape[1])
    model.fit(X, y, verbose=0)

    print('Saving the Model *.h5...')
    model.save('model_candidate_filter.h5')

    yHat_proba = model.predict(X)
    yHat = np.copy(yHat_proba)
    yHat[yHat <= 0.5] = 0
    yHat[yHat > 0.5] = 1

    br_score = np.around(metrics.brier_score_loss(y, yHat_proba, pos_label=1),
                         decimals=5)
    print("Storing Results in .csv file")

    confidence = np.zeros((yHat_proba.shape[0], yHat_proba.shape[1]))
    for i in range(0, yHat_proba.shape[0]):
        if yHat_proba[i] <= 0.5:
            confidence[i] = 1 - yHat_proba[i]
        else:
            confidence[i] = yHat_proba[i]

    results_data_frame = pd.DataFrame(
        columns=["Predictions", "Confidence Level"], index=data_df.index)
    results_data_frame["Predictions"] = yHat.astype(np.int64).ravel()
    results_data_frame["Confidence Level"] = np.around(confidence, decimals=4)
    results_data_frame.to_csv("Results_predictions_confidence_train.csv",
                              encoding='utf-8',
                              header=True,
                              index=True)

    return np.mean(scores), br_score
Example #13
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
 lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
 lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
 lambda: floresta.tagged_sents(tagset='simple'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
 lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
 lambda: mac_morpho.tagged_sents(tagset='simple'),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #14
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
 lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
 lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
 lambda: floresta.tagged_sents(tagset='universal'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
 lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
 lambda: mac_morpho.tagged_sents(tagset='universal'),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #15
from nltk.corpus import brown, conll2000, alpino, floresta, gutenberg

from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter

from HMM import *

# Load the Training and Test Sentences
print("Downloading Training Sentences from Corpus")
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
print "Done!"

print("Downloading Test Sentences from Corpus")
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print "Done!"


# Extracts words and tags from Sentences
def extractWords_and_Tags(sentences):
    words = {}
    tags = {}
Example #16
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='simple'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(tagset='simple'),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #17
 def __init__(self):
     super(CountAdjectives, self).__init__()
     self.tagger = PerceptronTagger(load=True)
     training_corpus = list(alpino.tagged_sents())
     self.tagger.train(training_corpus)
Example #18
import nltk
from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

alpino = alp.tagged_sents()
unitagger = UnigramTagger(alpino)
bitagger = BigramTagger(alpino, backoff=unitagger)
pos_tag = bitagger.tag
sent = 'Een telescoop is een instrument dat een astronoom gebruikt .'.split()
print(pos_tag(sent))
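The backoff chain above extends naturally by one more level with NLTK's TrigramTagger; a sketch (training the full chain on Alpino takes noticeably longer):

from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

train_sents = alp.tagged_sents()
unitagger = UnigramTagger(train_sents)
bitagger = BigramTagger(train_sents, backoff=unitagger)
tritagger = TrigramTagger(train_sents, backoff=bitagger)
sent = 'Een telescoop is een instrument dat een astronoom gebruikt .'.split()
print(tritagger.tag(sent))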

Example #19
 "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
     categories="humor", tagset="universal"
 ),
 "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
     tagset="universal"
 ),
 "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
     tagset="universal"
 ),
 "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
     tagset="universal"
 ),
 "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
     tagset="universal"
 ),
 "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
 "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
     files="hindi.pos", tagset="universal"
 ),
 "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
 "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
     tagset="universal"
 ),
 "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
 "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
     tagset="universal"
 ),
Example #20
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
Example #21
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='universal'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.tagged_sents(),
 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
     lambda: mac_morpho.tagged_sents(tagset='universal'),
 'Spanish: CESS-ESP Corpus (simplified)':
Example #22
def train_dutch_tagger():
    training_corpus = alp.tagged_sents()
    unitagger = UnigramTagger(training_corpus)
    bitagger = BigramTagger(training_corpus, backoff=unitagger)
    pos_tag = bitagger.tag
    return pos_tag
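Since retraining the unigram/bigram pair on the full Alpino corpus is slow, the trained tagger is often pickled once and reloaded on later runs; a sketch (the file name is an assumption, not taken from the project above):

import pickle

from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

training_corpus = alp.tagged_sents()
unitagger = UnigramTagger(training_corpus)
bitagger = BigramTagger(training_corpus, backoff=unitagger)
with open('dutch_bigram_tagger.pickle', 'wb') as f:
    pickle.dump(bitagger, f)

# later, or in another process
with open('dutch_bigram_tagger.pickle', 'rb') as f:
    pos_tag = pickle.load(f).tag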
Example #23
 "English: Brown Corpus (Humor, simplified)":
 lambda: brown.tagged_sents(categories="humor", tagset="universal"),
 "English: NPS Chat Corpus":
 lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)":
 lambda: nps_chat.tagged_posts(tagset="universal"),
 "English: Wall Street Journal Corpus":
 lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)":
 lambda: treebank.tagged_sents(tagset="universal"),
 "Chinese: Sinica Corpus":
 lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)":
 lambda: sinica_treebank.tagged_sents(tagset="universal"),
 "Dutch: Alpino Corpus":
 lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)":
 lambda: alpino.tagged_sents(tagset="universal"),
 "Hindi: Indian Languages Corpus":
 lambda: indian.tagged_sents(files="hindi.pos"),
 "Hindi: Indian Languages Corpus (simplified)":
 lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
 "Portuguese: Floresta Corpus (Portugal)":
 lambda: floresta.tagged_sents(),
 "Portuguese: Floresta Corpus (Portugal, simplified)":
 lambda: floresta.tagged_sents(tagset="universal"),
 "Portuguese: MAC-MORPHO Corpus (Brazil)":
 lambda: mac_morpho.tagged_sents(),
 "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
 lambda: mac_morpho.tagged_sents(tagset="universal"),
 "Spanish: CESS-ESP Corpus (simplified)":
Example #24
from flask import Flask, render_template, request
from flask_cors import CORS
from flask_socketio import SocketIO, emit
from logger import getlogger
import nltk
import sys
import timeit
import urllib.request
import json
import settings
import requests
from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

training_corpus = alp.tagged_sents()
unitagger = UnigramTagger(training_corpus)
bitagger = BigramTagger(training_corpus, backoff=unitagger)
pos_tag = bitagger.tag
logger = getlogger(__name__)

app = Flask(__name__, template_folder='html/templates', static_folder='html/static')
CORS(app)
socketio = SocketIO(app)
app.debug = False

@app.route('/')

def index():
    return render_template('test4nl.html')

def query_pixabay(nouns):