Example no. 1
	def getStemmedParagraph(self,text,headersize):
		stemmer = GreekStemmer()
		words = []
		for word in self.removeStopWords(self.preprocessText(text))[:headersize].split():
			stem = stemmer.stem(word)
			words.append(stem)
		return " ".join(words)
Example no. 2
def test_stem_examples():
    gs = GreekStemmer()
    words = []
    with open('tests/fixtures/examples.yml', 'r') as f:
        words = yaml.safe_load(f)  # safe_load avoids PyYAML's explicit-Loader requirement

    for word, st in words.items():
        assert gs.stem(word) == st
Example no. 3
    def __init__(self, organisations_file, text_preprocessor, ratio,
                 headersize):
        stemmer = GreekStemmer()
        v = CountVectorizer(ngram_range=(1, 2), lowercase=False)
        stemmed_organisations = []
        freq_organisations = {}
        with open(organisations_file) as fp:
            pat = re.compile(r"[^\w\.]+")  # raw string avoids invalid-escape warnings
            for cnt, line in enumerate(fp):
                #print(line)
                l = []
                clean_line = ' '.join(pat.split(line.replace('"', '')))
                if clean_line != "":
                    for w in clean_line.split():
                        stem = stemmer.stem(w)
                        l.append(stem.upper())  # collect upper-case stemmed tokens
                    organisation = text_preprocessor.getCleanText(
                        " ".join(l), headersize)  # join stems into one cleaned string
                    wordgrams = v.fit([organisation]).vocabulary_.keys()
                    for wgram in wordgrams:
                        freq_organisations[wgram] = freq_organisations.get(wgram, 0) + 1
                    stemmed_organisations.append(organisation)  # keep the full list of organisations

        temp_df = pd.DataFrame(list(freq_organisations.items()),
                               columns=['stems', 'freq'])

        selected_df = temp_df[temp_df['freq'] /
                              len(stemmed_organisations) > ratio]

        maxvalue = selected_df['freq'].max()
        meanvalue = selected_df['freq'].mean()

        most_freq = selected_df[selected_df.freq > int(meanvalue)]

        freq_stems = selected_df['stems'].values.tolist()
        freqstemscopy = []
        for s in freq_stems:
            if not text_preprocessor.hasNumbers(s) and len(s) > 3:
                freqstemscopy.append(s)

        self.org_trie = ts.TrieSearch(freqstemscopy)
        self.text_preprocessor = text_preprocessor
        self.headersize = headersize
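The constructor above keeps the frequent uppercase organisation-name stems in a trie (ts.TrieSearch). A minimal sketch of the matching side, with a plain set-membership check standing in for the trie and hypothetical stem values:

# Sketch only: normalize a candidate name the same way (stem, then uppercase)
# and check its stems against the frequent organisation stems.
from greek_stemmer import GreekStemmer  # assumed import path

frequent_stems = {"ΤΡΑΠΕΖ", "ΕΛΛΑΔ"}  # hypothetical values
stemmer = GreekStemmer()

def looks_like_organisation(name):
    stems = [stemmer.stem(w).upper() for w in name.split()]
    return any(s in frequent_stems for s in stems)

print(looks_like_organisation("ΤΡΑΠΕΖΑ ΕΛΛΑΔΟΣ"))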
Example no. 4
def stemmer_clean(entities):
    print(entities)
    stemmer = GreekStemmer()
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'stemmer_exclude_words.txt')
    with open(file_path, 'r') as file_:
        exclude_list = file_.read().splitlines()
    stemmed_excl_list = [
        stemmer.stem(prepair_word(x).upper()) for x in exclude_list
    ]
    clean_entities = [
        x for x in entities if stemmer.stem(prepair_word(x[1]).upper())
        not in stemmed_excl_list
    ]
    return clean_entities
Example no. 5
    def text_stemming(self):
        """
        stem the text
        """
        if self.language == "french":
            stemmer = FrenchStemmer()
        elif self.language == "english":
            stemmer = PorterStemmer()
        elif self.language == "italian":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "german":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "spanish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "dutch":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "portuguese":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "danish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "greek":
            stemmer = GreekStemmer()
        elif self.language == "arabic":
            stemmer = ISRIStemmer()
        else:
            # raising here avoids the NameError the original bare print would
            # cause below, where an undefined stemmer is used
            raise ValueError(
                "language must be one of: french, english, italian, german, "
                "spanish, dutch, portuguese, danish, greek or arabic")

        self.text = ' '.join(
            [stemmer.stem(word) for word in word_tokenize(self.text)])
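Most branches above differ only in the language string passed to SnowballStemmer, so the selection can be collapsed into a small lookup table. A sketch of that refactor with the same stemmers and languages (word_tokenize is replaced by a plain split() to keep the sketch self-contained):

# Sketch only: language-to-stemmer lookup instead of a long elif chain.
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.stem.isri import ISRIStemmer
from greek_stemmer import GreekStemmer  # assumed import path

def build_stemmer(language):
    special = {
        "french": FrenchStemmer,
        "english": PorterStemmer,
        "greek": GreekStemmer,
        "arabic": ISRIStemmer,
    }
    snowball_languages = {"italian", "german", "spanish", "dutch",
                          "portuguese", "danish"}
    if language in special:
        return special[language]()
    if language in snowball_languages:
        return SnowballStemmer(language)
    raise ValueError("unsupported language: %s" % language)

stemmer = build_stemmer("english")
print(' '.join(stemmer.stem(w) for w in "the runners were running".split()))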
Example no. 6
def tweet_preprocessing(df, stemming, lemmatization):

    # STOPWORDS
    stopwords_greek = set(stopwords.words('greek'))

    print(stopwords_greek)
    sw_list = []
    for sw in stopwords_greek:
        sw_clear = remove_intonation(sw)
        sw_list.append(sw_clear)

    print(set(sw_list))

    stopwords_greek = import_additional_greek_stopwords(stopwords_greek)

    # STEMMER
    stemmer = GreekStemmer()

    # LEMMATIZATION
    corpus_importer = CorpusImporter('greek')
    # corpus_importer.import_corpus('greek_models_cltk')
    # corpus_importer.import_corpus('greek_training_set_sentence_cltk')
    # print(corpus_importer.list_corpora)

    # lemmatizer = LemmaReplacer('greek')
    spacy_lemmatizer = spacy.load('el_core_news_lg')

    list_of_tweets = []

    positive_list, negative_list = get_pos_neg_lists(stemmer)
    print(positive_list)
    print(negative_list)
    pos_score_list = []
    neg_score_list = []
    for i in range(0, len(df)):

        tweet = str(df.iloc[i]['text'])
        # print(tweet)

        # remove accentuation
        tweet = remove_intonation(tweet)

        # tokenization
        tokens = tweet.split()
        tokens = drop_single_chars(tokens)

        # remove very small or very large tweets
        # if len(tokens) > 36 or len(tokens) < 3:
        #     df.loc[i, 'label'] = 10

        # p_score, n_score = count_pos_neg_score(tokens, positive_list, negative_list)

        # remove stopwords
        if stemming is True:
            words = [
                stemmer.stem(w.upper()) for w in tokens
                if w not in stopwords_greek
            ]
            tweet_clean = ' '.join([w for w in words if len(w) > 1])
            tweet_clean = tweet_clean.lower()
            p_score, n_score = count_pos_neg_score(words, positive_list,
                                                   negative_list)
            # print(p_score)
            # print(n_score)

        else:
            words = [w for w in tokens if w not in stopwords_greek]
            # if 'ξεφτιλες' in words or 'τσιρκο' in words:
            tweet_clean = ' '.join(words)
            # the scores must be computed in this branch too, otherwise
            # p_score/n_score are undefined below when stemming is False
            p_score, n_score = count_pos_neg_score(words, positive_list,
                                                   negative_list)

            if lemmatization is True:
                # lemmatization
                tweet_lemmas = spacy_lemmatizer(tweet_clean)
                lemmas = [t.lemma_ for t in tweet_lemmas]
                tweet_clean = ' '.join([l for l in lemmas])

        # print(tweet_clean)
        if len(tweet_clean) >= 1:
            list_of_tweets.append(tweet_clean)
        else:
            list_of_tweets.append(np.nan)

        pos_score_list.append(p_score)
        neg_score_list.append(n_score)

        # df.loc[i]['text'] = tweet_clean

    df['tweet'] = list_of_tweets
    df['#pos'] = pos_score_list
    df['#neg'] = neg_score_list
    return df
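The stemming branch above relies on accent removal (remove_intonation) and upper-casing before GreekStemmer is applied. A self-contained sketch of that normalization for a single token, using the same unicodedata-based accent stripping as the strip_accents helpers in the later examples:

# Sketch only: strip accents, uppercase, then stem one token.
import unicodedata
from greek_stemmer import GreekStemmer  # assumed import path

stemmer = GreekStemmer()

def normalize_and_stem(token):
    stripped = ''.join(c for c in unicodedata.normalize('NFD', token)
                       if unicodedata.category(c) != 'Mn')
    return stemmer.stem(stripped.upper())

print(normalize_and_stem("πολιτική"))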
Example no. 7
def test_stem_with_non_greek_letters():
    gs = GreekStemmer()
    assert gs.stem(u"englishΟΣ") == u"englishΟΣ"
Example no. 9
class Preprocessor(object):
    def __init__(self,
                 ignore_pickles=False,
                 strict=False,
                 n_bigrams=0,
                 bigram_min_freq=3):

        self.greek_stemmer = GreekStemmer()

        with open(os.path.join(os.path.dirname(__file__), 'countries.txt'),
                  'r',
                  encoding="utf8") as fp:
            self.countries = set([
                self.greek_stemmer.stem(self.strip_accents(w[:-1]).upper())
                for w in fp.readlines()
            ])

        with open(os.path.join(os.path.dirname(__file__), 'neutral_words.txt'),
                  'r',
                  encoding="utf8") as fp:
            self.neutral_words = set([w[:-1] for w in fp.readlines()])

        # Used to determine whether we should use parsing pickles or parse the files again
        self.strict = strict

        # Used to ignore preprocessing pickles
        self.ignore_pickles = ignore_pickles

        self.transform_model = None

        self.reduction_model = None
        self.reduction_model_type = None

        self.classifier = None
        self.classifier_type = None

        # Dictionary mapping labels to one-hot vectors
        self.label_dict = None

        # IDF calculated from training set
        self.idf = []

        # Dictionary mapping word ids to words
        self.id2word = None

        # Word dictionary mapping words to indexes
        self.word_dict = None

        # Indexes of most discriminating words
        self.selected_words = []

        # Variances of discriminating words
        self.var = np.array([])

        # Variables used for bigram features
        self.use_bigrams = n_bigrams > 0
        self.n_bigrams = n_bigrams

        # Threshold for bigram occurrence in texts
        self.bigram_min_freq = bigram_min_freq

        # List of bigram tuples
        self.bigrams = []

        # Scores of most descriptive bigrams
        self.best_bigram_scores = []
        self.bigram_indexes = []

        # Compile regexes that clear text
        self.clear_regexes = [

            # Merges decimal number into a single numerical value
            (re.compile(r'([0-9])(\s*[\.,]\s*)([0-9])'), r'\1\3'),  # raw string so \1\3 act as backreferences

            # Takes care of acronyms
            (re.compile(
                r'(([\w]+)\.)(([\w]+)\.)(([\w]+)\.)?(([\w]+)\.)?(([\w]+)\.)?'),
             r'\2\4\6\8\10'),

            # Removes every character that is not a word or a digit
            (re.compile(r'([^\w0-9])+'), ' ')
        ]

        # Constant used in place of country name
        self.COUNTRY_CONST = 'COUNTRYVALUE'

        # Constant used instead of numerical values
        self.NUMBER_CONST = 'NUMBERVALUE'

    # Unpickle data from file
    def unpickle_data(self, file):

        data_file = file + '.data'
        meta_file = file + '.metadata'

        if os.path.isfile(meta_file) and os.path.isfile(
                data_file) and not self.ignore_pickles:

            return_val = ()

            with open(meta_file, 'rb') as f:
                metadata = pickle.load(f)
                if isinstance(metadata, list) and len(metadata) > 0:

                    # If we have a numpy array
                    if len(metadata[0]) == 3:
                        dtype, w, h = metadata[0]

                        with open(data_file, 'rb') as fh:
                            return_val = (np.frombuffer(fh.read(),
                                                        dtype=dtype).reshape(
                                                            (int(w),
                                                             int(h))), )

                    if len(metadata) > 1:
                        return_val = return_val + metadata[1]

            if len(return_val) == 1:
                return return_val[0]
            else:
                return return_val

        else:
            return []

    # Pickle data to file
    def pickle_data(self, file, data):

        data_file = file + '.data'
        meta_file = file + '.metadata'

        numpy_matrix = np.array([])
        metadata = []

        if type(data) == np.ndarray:
            numpy_matrix = data
            metadata.append((numpy_matrix.dtype, *numpy_matrix.shape))

        elif isinstance(data, tuple) and len(data) > 0 and type(
                data[0]) == np.ndarray:
            numpy_matrix = data[0]
            metadata.append((numpy_matrix.dtype, *numpy_matrix.shape))
            if len(data) > 1:
                metadata.append(data[1:])
        else:
            metadata.append(())
            metadata.append(data)

        with open(meta_file, 'wb') as f:
            # Pickle the metadata file.
            pickle.dump(metadata, f, pickle.HIGHEST_PROTOCOL)

        # If we have metadata for numpy array, then write
        # the array to disc
        if len(metadata[0]) > 0:
            with open(data_file, 'wb+') as fh:
                fh.write(np.ascontiguousarray(numpy_matrix).data)

    def parse_files(self, dir, only_parse=False, is_train=False):

        is_kfold = isinstance(dir, tuple)
        is_train = is_kfold or is_train

        # List of directories to parse
        directories = list(dir) if is_kfold else [dir]

        # Pickle file for the given directories
        pickle_file = './data/' + "-".join(directories) + '.pickle'

        # If pickles exist and we don't want to ignore them, then load and return
        # preprocessed articles from pickle
        if not (self.ignore_pickles
                and self.strict) and os.path.isfile(pickle_file):
            with open(pickle_file, 'rb') as f:
                # The protocol version used is detected automatically, so we do not
                # have to specify it.
                self.best_bigram_scores, articles, labels = pickle.load(f)
                self.bigrams = set([b for b, s in self.best_bigram_scores])
                return articles, labels

        articles = []
        labels = []

        i = 0

        # Loop through every directory
        for root, dirs, files in chain.from_iterable(
                os.walk('./data/' + dirr) for dirr in directories):

            # Loop through every file
            for name in files:

                link = os.path.join(root, name)

                # Open only .raw files
                if re.search(r'\.raw$', name):

                    # Ignore encoding issues and open file as ISO-8859-7
                    with codecs.open(link,
                                     'r',
                                     encoding='ISO-8859-7',
                                     errors='ignore') as f:

                        # Parse and preprocess file
                        m = re.match(r'^[a-zA-Z]+', name)
                        if m:
                            data = f.read().replace('\n',
                                                    ' ').replace('\x96', ' ')
                            articles.append(
                                data if only_parse else self.preprocess(data))
                            labels.append(m.group(0))
                            i += 1

        best_bigram_scores = []

        # If we want to use bigrams
        if self.use_bigrams:

            # Bigrams are collected from training set only
            if is_train:

                bigram_measures = nltk.collocations.BigramAssocMeasures()
                finder = BigramCollocationFinder.from_documents(articles)

                # Filter bigrams that appear in less than bigram_min_freq texts
                finder.apply_freq_filter(self.bigram_min_freq)

                # Get first n bigrams with highest PMI
                best_bigram_scores = [(b, s) for b, s in finder.score_ngrams(
                    bigram_measures.pmi)[:self.n_bigrams]]
                best_bigrams = [b for b, s in best_bigram_scores]
                self.bigrams = set(best_bigrams)
                self.best_bigram_scores = best_bigram_scores

            # Append the bigrams to our articles, preparing for the feature extraction phase
            articles = [
                article + [
                    b[0] + " " + b[1]
                    for b in nltk.bigrams(article) if b in self.bigrams
                ] for article in articles
            ]

        if len(articles) != len(labels):
            raise Exception("Couldn't create labels")

        with open(pickle_file, 'wb') as f:
            # Pickle data to file
            pickle.dump((best_bigram_scores, articles, labels), f,
                        pickle.HIGHEST_PROTOCOL)

        return articles, labels

    def preprocess(self, text):

        # Remove accent characters
        text = self.strip_accents(text)

        # Convert to uppercase
        text = text.upper()

        # Tokenize
        words = self.tokenize(text)

        # Stem words and remove neutral words
        words = [
            self.greek_stemmer.stem(w) for w in words
            if w not in self.neutral_words
        ]

        r = re.compile('[0-9]')

        # Replace numerical values with NUMBER_CONST
        words = [self.NUMBER_CONST if bool(r.search(w)) else w for w in words]

        # Replace country names with COUNTRY_CONST
        words = [
            self.COUNTRY_CONST if w in self.countries else w for w in words
        ]

        # Remove words with less than 3 letters
        words = [w for w in words if len(w) > 2]

        return words

    def strip_accents(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def tokenize(self, text):

        for regex, replacement in self.clear_regexes:
            text = regex.sub(replacement, text)

        words = text.split(' ')

        return words

    # Used to create a word dictionary
    def create_word_dictionary(self, texts, recreate=True):

        words_dict = {}
        counter = 0

        pickle_file = './data/word_dict'

        # Unpickle dict if possible
        data = self.unpickle_data(pickle_file)
        if len(data) > 0:
            words_dict, counter = data
            self.id2word = {b: a for (a, b) in words_dict.items()}
            self.word_dict = words_dict
            return words_dict

        # Loop through texts and add new words to dictionary
        for text in texts:
            for word in text:
                if word not in words_dict:
                    words_dict[word] = counter
                    counter += 1

        # Pickle dict
        self.pickle_data(pickle_file, (words_dict, counter))

        # Create inverse dict
        self.id2word = {b: a for (a, b) in words_dict.items()}
        self.word_dict = words_dict
        return words_dict

    def create_tfidf_train(self, word_dict, texts, labels, n_dims=3000):

        pickle_file = './data/train/tfidf'
        data = self.unpickle_data(pickle_file)

        if len(data) > 0:
            tfidf, self.var, self.selected_words, self.idf = data
            return tfidf

        # Useful values
        n_words = max(word_dict.values())
        n_documents = len(texts)
        label_set = set(labels)
        n_classes = len(label_set)
        n_features = n_words + 1

        # Matrix used to represent the number of occurrences of each term in each text
        m = []

        # Loop through texts and create matrix m
        for text in texts:
            word_vec = [0] * (n_words + 1)

            for word in text:

                if word in word_dict:
                    word_vec[word_dict[word]] += 1

            m.append(word_vec)

        m = np.array(m)

        # Create y from labels
        dct = {k: i for (i, k) in enumerate(label_set)}
        y = np.array([dct[i] for i in labels])

        # For every class calculate the TF of every term
        m_class = np.zeros((n_classes, n_features))
        for i in range(n_classes):
            a = np.sum(m[y == i, :], axis=0)
            m_class[i, :] = a / np.sum(a)

        # Find the variance of tf among different classes
        # This is used to extract the terms that differentiate
        # most one class from the other
        var = np.var(m_class, axis=0)

        # Take top terms with largest variance among classes
        self.selected_dims = var.argsort()[-n_dims:][::-1]

        # Create tuples mapping term id to term variance
        self.selected_words = [(self.id2word[id], var[id])
                               for id in self.selected_dims]
        self.var = var

        # Reduced table m to new dimensions
        m_reduced = m[:, self.selected_dims]

        # Create tf matrix
        tft = m_reduced / np.sum(m_reduced, axis=1).reshape((-1, 1))

        # Document frequency
        doc_frequency = np.sum(np.int32(m_reduced > 0), axis=0)

        # Inverse document frequency
        idf = np.log(n_documents / doc_frequency)
        self.idf = idf

        # Calculate IDF
        tfidf = tft * idf

        # self.calc_mutual_information(tfidf,m_reduced)

        # Pickle our data
        self.pickle_data(pickle_file, (tfidf, var, self.selected_words, idf))

        return tfidf

    # Used to calculate mutual information
    def calc_mutual_information(self, tfidf, m, n_dims_reduced=1000):

        m1 = np.int32(m > 0)

        n_documents = m1.shape[0]

        n_dims = m1.shape[1]

        res = [[
            np.NINF if i >= j else np.log(
                n_documents * np.sum(np.int32(m1[:, i] == m1[:, j])) /
                np.sum(np.int32(m[:, i] > 0)) / np.sum(np.int32(m[:, j] > 0)))
            for j in range(m1.shape[1])
        ] for i in range(m1.shape[1])]

        res = np.array(res) / np.log(2)

        res[np.isnan(res)] = np.NINF

        best_pairs = res.reshape(-1).argsort()[-n_dims_reduced:][::-1]

        results = [(a, b)
                   for a, b in zip(best_pairs // n_dims, best_pairs % n_dims)]

        #for tup in results:
        #	print(self.id2word[self.selected_dims[tup[0]]]+"-"+self.id2word[self.selected_dims[tup[1]]])

    def create_tfidf_test(self, word_dict, texts):

        if len(self.idf) == 0:
            raise Exception("You must create training idf first")

        pickle_file = './data/test/tfidf'
        data = self.unpickle_data(pickle_file)

        if len(data) > 0:
            tfidf = data
            return tfidf

        n_words = max(word_dict.values())

        # Matrix used to represent the number of occurrences of each term in each text
        m = []

        # Loop through texts and extract features
        for text in texts:
            word_vec = np.array([0] * (n_words + 1))

            for word in text:

                if word in word_dict:
                    word_vec[word_dict[word]] += 1

            m.append(word_vec)

        m = np.array(m)

        # Reduce matrix m according to the selected dimensions
        m_reduced = m[:, self.selected_dims]

        # Test set TF matrix
        tft = m_reduced / np.sum(m_reduced, axis=1).reshape((-1, 1))

        # Test set TFIDF matrix
        tfidf = tft * self.idf

        self.pickle_data(pickle_file, tfidf)

        return tfidf

    # Transform data
    def transform_train(self, tfidf, method='entropy', mode='train'):

        self.transform_model = method

        # If a corresponding pickle exists, load data from pickle
        pickle_file = './data/' + mode + '/tranform_' + method
        data = self.unpickle_data(pickle_file)
        if len(data) > 0:
            return data

        l = []

        # Entropy transformation
        if method == 'entropy':

            p = tfidf / np.sum(tfidf, axis=0)
            p[np.isnan(p)] = 1

            e = 1 + np.nan_to_num(
                np.sum(p * np.log(p), axis=0) / np.log(tfidf.shape[0]))
            e[e == -inf] = 0

            l = e * np.log(1 + tfidf)

        # Binary transformation
        elif method == 'binary':
            l = 1 * (tfidf > 0)

        # Logarithmic transformation
        elif method == 'log':
            l = np.log(1 + tfidf)

        # No transformation
        elif method == 'none':
            l = tfidf

        self.pickle_data(pickle_file, l)

        return l

    # Transform test set using the chosen transformation method
    def transform_test(self, tfidf):

        if self.transform_model is None:
            raise Exception("You must train first!")

        return self.transform_train(tfidf, self.transform_model, mode='test')

    # Perform dimensionality reduction
    def reduce_dims_train(self, X, y, method='PCA', **kwargs):

        pickle_file = './data/train/reduced_dims_' + method

        data = self.unpickle_data(pickle_file)

        if len(data) > 0:
            transformed, l_kwargs, transform_model_type, reduction_model = data
            if transform_model_type == self.transform_model and l_kwargs == kwargs:
                self.reduction_model = reduction_model
                return transformed

        if method == 'PCA':
            self.reduction_model = PCA(**kwargs)

        elif method == 'LDA':
            self.reduction_model = LDA(n_components=50)
            dct = {k: i for (i, k) in enumerate(set(y))}
            y = [dct[i] for i in y]

        else:
            raise Exception("Wrong Method")

        # Fit the reduction model to our data
        self.reduction_model.fit(X, y)

        # Perform dimensionality reduction
        transformed = self.reduction_model.transform(X)

        self.pickle_data(
            pickle_file,
            (transformed, kwargs, self.transform_model, self.reduction_model))

        return transformed

    # Perform dimensionality reduction to the test set using the chosen method
    def reduce_dims_test(self, X):

        if self.reduction_model is None:
            raise Exception("You must train first!")

        return self.reduction_model.transform(X)

    # Perform one-hot encoding to our labels
    def encode_labels(self, y):

        s = set(y)

        n = len(s)

        if self.label_dict is None:
            self.label_dict = {
                label: [1 * (i == j) for j in range(n)]
                for (i, label) in enumerate(s)
            }

        y = np.array([self.label_dict[label] for label in y])

        return y

    # Train model with different parameters and methods
    def train_model(self, X, y, method='KNN', **kwargs):

        self.classifier_type = method
        #85%
        if method == 'KNN':
            # n_neighbors=5, metric='minkowski'
            self.classifier = KNN(**kwargs)
            y = np.argmax(y, axis=1)
            self.classifier.fit(X, y)
        elif method == 'SVM':
            # gamma='scale', decision_function_shape='ovo'
            self.classifier = svm.SVC(**kwargs)
            y = np.argmax(y, axis=1)
            self.classifier.fit(X, y)
        #77%
        elif method == 'NB':
            self.classifier = NB(**kwargs)
            y = np.argmax(y, axis=1)
            self.classifier.fit(X, y)

        elif method == 'GMM':
            # n_components=5, tol=1e-3, max_iter=100, init_params='kmeans'
            self.classifier = GMM(**kwargs)
            y = np.argmax(y, axis=1)
            self.classifier.fit(X, y)
            #pass
        #79%
        elif method == 'RandomForest':
            self.classifier = RFC(**kwargs)
            self.classifier.fit(X, y)

        elif method == 'MEAN':
            self.classifier = MEAN_CLASSIFIER(**kwargs)
            y = np.argmax(y, axis=1)
            self.classifier.fit(X, y)

        elif method == 'ANN':

            self.classifier = Sequential()
            l1, a1 = 50, 'relu'
            l2, a2 = 20, 'relu'
            learning_rate = 0.001
            n_epochs = 50
            b_size = 10

            if 'learning_rate' in kwargs:
                learning_rate = kwargs['learning_rate']

            if 'layers' in kwargs:
                l1, a1 = kwargs['layers'][0]
                l2, a2 = kwargs['layers'][1]

            if 'epochs' in kwargs:
                n_epochs = kwargs['epochs']

            if 'batch_size' in kwargs:
                b_size = kwargs['batch_size']

            self.classifier.add(Dense(l1, input_dim=X.shape[1], activation=a1))
            self.classifier.add(Dense(l2, activation=a2))
            self.classifier.add(Dense(30, activation=a2))
            self.classifier.add(Dense(y.shape[1], activation='softmax'))

            optimizer = optimizers.Adam(lr=learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.999)
            self.classifier.compile(loss=losses.kullback_leibler_divergence,
                                    optimizer=optimizer,
                                    metrics=['accuracy'])
            self.classifier.fit(X, y, epochs=n_epochs, batch_size=b_size)

        elif method == 'CNN':

            n_epochs = 50
            b_size = 10
            learning_rate = 0.001

            if 'learning_rate' in kwargs:
                learning_rate = kwargs['learning_rate']

            if 'epochs' in kwargs:
                n_epochs = kwargs['epochs']

            if 'batch_size' in kwargs:
                b_size = kwargs['batch_size']

            X = np.expand_dims(X, axis=2)

            model = Sequential()
            model.add(
                Convolution1D(nb_filter=128,
                              filter_length=1,
                              input_shape=(X.shape[1], 1)))
            model.add(MaxPooling1D(pool_size=2, strides=None, padding='valid'))
            model.add(Activation('relu'))
            model.add(Flatten())
            model.add(Dropout(0.4))
            model.add(Dense(128, activation='relu'))
            model.add(Dense(64, activation='relu'))
            model.add(Dense(y.shape[1]))
            model.add(Activation('softmax'))

            optimizer = optimizers.Adam(lr=learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.999)
            model.compile(loss=losses.kullback_leibler_divergence,
                          optimizer=optimizer,
                          metrics=['accuracy'])
            model.fit(X, y, epochs=n_epochs, batch_size=b_size)

            self.classifier = model

        else:
            raise Exception("No such classifier exists!")

    # Evaluate our model
    def evaluate_model(self, X, y):

        if self.classifier is None or self.classifier_type is None:
            raise Exception("You must first train the classifier!")

        if self.classifier_type == 'ANN':
            loss, acc = self.classifier.evaluate(X, y)

            return acc
        elif self.classifier_type == 'CNN':
            X = np.expand_dims(X, axis=2)

            loss, acc = self.classifier.evaluate(X, y)

            return acc

        elif self.classifier_type in ['NB', 'GMM', 'SVM', 'KNN', 'MEAN']:
            pred = self.classifier.predict(X)
            y = np.argmax(y, axis=1)
            return np.sum(1 * (pred == y)) / len(y)
        else:
            pred = np.argmax(self.classifier.predict(X), axis=1)
            y = np.argmax(y, axis=1)
            return np.sum(1 * (pred == y)) / y.shape[0]
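In transform_train above, the 'entropy' option weights each term by one plus its normalized entropy over documents and then applies a log transform. A numpy-only sketch of the same computation on a toy matrix (rows are documents, columns are terms); it mirrors the original code, including its reliance on nan_to_num for zero entries:

# Sketch only: entropy weighting as in the 'entropy' branch of transform_train.
# numpy will emit log(0) warnings here, exactly as the original code does.
import numpy as np

tfidf = np.array([[2.0, 0.0, 1.0],
                  [0.0, 3.0, 1.0],
                  [1.0, 0.0, 1.0]])

p = tfidf / np.sum(tfidf, axis=0)   # per-term distribution over documents
p[np.isnan(p)] = 1                  # guard against all-zero columns
e = 1 + np.nan_to_num(np.sum(p * np.log(p), axis=0) / np.log(tfidf.shape[0]))
e[e == -np.inf] = 0
weighted = e * np.log(1 + tfidf)
print(weighted)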
Example no. 10
    def structure(self, screen_name):
        stemmer = GreekStemmer()
        greece = timezone("Europe/Athens")
        gmt = timezone("GMT")
        self.logger.log_system("Opening extracted tweet file")
        output_file_name = "./output/tweets/%s.json" % screen_name
        output_file = open(output_file_name, "r")
        self.logger.log_system("File opened")
        user = self.session.query(User).filter_by(id=screen_name).first()
        if user is None:
            user = User(id=screen_name)
            self.session.add(user)
            self.session.commit()
        count = 0
        self.logger.log_system("Structuring tweets into database")
        for line in output_file:
            count = count + 1
            tweet = json.loads(line)
            tweet_id = tweet["id"]
            if self.session.query(Tweet).filter_by(
                    id=tweet_id).first() is None:
                # key tweet data upload
                self.logger.log_system(
                    "Structuring tweet #:%d | id:%s | user:%s" %
                    (count, tweet["id"], screen_name))
                is_retweet = True if "retweeted_status" in tweet else False
                is_quote = True if "quoted_status" in tweet else False
                tweet_text = tweet["text"]
                timestamp = datetime.strptime(tweet['created_at'],
                                              '%a %b %d %H:%M:%S +0000 %Y')
                timestamp = gmt.localize(timestamp)
                self.session.add(Tweet(id=tweet_id, \
                                user_id=screen_name, \
                                time_created=timestamp.strftime("%Y-%m-%d %H:%M:%S"), \
                                full_text=tweet_text, \
                                is_retweet=is_retweet, \
                                is_quote=is_quote, \
                                source=tweet["source"], \
                                in_reply_to_screen_name=tweet["in_reply_to_screen_name"]))

                # time conversion to Athens and time segmentation
                timestamp = timestamp.astimezone(greece)
                self.logger.log_system("Time segmenting tweets")
                self.session.add(TweetAnalytics(id = tweet_id, \
                                time_mapped=timestamp.strftime("%Y-%m-%d %H:%M:%S"), \
                                year_mapped=timestamp.strftime("%Y"), \
                                month_mapped=timestamp.strftime("%m"), \
                                day_mapped=timestamp.strftime("%d"), \
                                hour_mapped=timestamp.strftime("%H"), \
                                weekday_mapped=timestamp.strftime("%w"), \
                                time_segment="%s%s" % (timestamp.strftime("%Y")[-2:], timestamp.strftime("%m%d%H"))))

                # generate tokens
                self.logger.log_system("Generating tokens")
                tweet_text = re.sub(r"http\S+", "", tweet_text)  # remove links
                tweet_text = tweet_text.replace("RT @", "@")  # strip the retweet prefix
                tweet_text = tweet_text.replace('"', " ")  # remove quotes
                tweet_text = tweet_text.replace(u"\u201D",
                                                " ")  # remove quotes
                tweet_text = tweet_text.replace(u"\u201C",
                                                " ")  # remove quotes
                tweet_text = tweet_text.replace(
                    u"\u0387", " ")  # remove non-sentimental punctuation marks
                tweet_text = tweet_text.replace(
                    ",", " ")  # remove non-sentimental punctuation marks
                tweet_text = tweet_text.replace(
                    ":", " ")  # remove non-sentimental punctuation marks
                tweet_text = tweet_text.replace(
                    ".", " ")  # remove non-sentimental punctuation marks
                tweet_text = tweet_text.replace(
                    "-", " - "
                )  # pad non-sentimental punctuation marks that may be useful in context
                tweet_text = tweet_text.replace(
                    "!", " ! ")  # pad contextual punctuation marks for parsing
                tweet_text = tweet_text.replace(
                    "?", " ? ")  # pad contextual punctuation marks for parsing
                tweet_text = tweet_text.replace(
                    ";", " ; ")  # pad contextual punctuation marks for parsing
                tweet_text = ' '.join(
                    tweet_text.split())  # remove redundant spaces
                raw_tokens = tweet_text.split()
                raw_token_count = 0
                for raw_token in raw_tokens:
                    raw_token_count = raw_token_count + 1
                    token = raw_token
                    token = ''.join(
                        c for c in unicodedata.normalize('NFD', raw_token)
                        if unicodedata.category(c) != 'Mn')  # remove accents
                    token = token.upper()  # all to uppercase
                    token = stemmer.stem(token)  # get the stem
                    dictionary_token_id = ""
                    dictionary_token = self.session.query(
                        DictionaryToken
                    ).filter_by(token=token).first(
                    )  # look up the stem in the sentiment dictionary; bit of a rushed approach, could be done much better
                    if dictionary_token is None:
                        dictionary_token_id = None
                    else:
                        dictionary_token_id = dictionary_token.id
                    self.session.add(
                        TweetToken(tweet_id=tweet_id,
                                   sequence=raw_token_count,
                                   token=token,
                                   raw_token=raw_token,
                                   dictionary_token=dictionary_token_id))
                self.logger.log_system("Tweet processed")
                self.session.commit()
            else:
                self.logger.log_system(
                    "Already structured tweet #:%d | id:%s " %
                    (count, tweet["id"]))
        return output_file
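The method above parses Twitter's created_at field as GMT, converts it to Europe/Athens, and derives the compact time_segment key from the converted time. A self-contained sketch of just that step, assuming the timezone helper is pytz (which matches the localize/astimezone calls above); the timestamp value is made up:

# Sketch only: GMT-to-Athens conversion and time segmentation.
from datetime import datetime
from pytz import timezone

created_at = "Wed Oct 10 20:19:24 +0000 2018"  # hypothetical created_at value
gmt = timezone("GMT")
greece = timezone("Europe/Athens")

timestamp = gmt.localize(
    datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y'))
athens = timestamp.astimezone(greece)
time_segment = "%s%s" % (athens.strftime("%Y")[-2:], athens.strftime("%m%d%H"))
print(athens.strftime("%Y-%m-%d %H:%M:%S"), time_segment)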
Example no. 11
def evaluate(model_name, k=5):
    with open("preprocess/test_articles_dataset.json", "r") as testF:
        test_articles = json.load(testF)

    if model_name == "rake":
        model = Rake(language_code="el")
    elif model_name == "yake":
        model = yake.KeywordExtractor(lan="el", top=k)
    elif model_name == "textrank":
        pos_el = spacy.load("el_core_news_md")
        tr = pytextrank.TextRank()
        pos_el.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    else:
        nltk.download('stopwords')
        pos_el = spacy.load("el_core_news_md")
        model, tokenizer = load_model(model_name)
    stemmer = GreekStemmer()
    num_predictions = 0
    num_golds = 0
    num_relevant = 0

    for article in tqdm(test_articles):
        doc = article['title'] + " " + article["abstract"]
        gold_keywords = article["keywords"]

        if model_name == "rake":
            pred_keywords = extract_keywords_RAKE(model, doc, top_n=k)
        elif model_name == "yake":
            pred_keywords = extract_keywords_YAKE(model, doc)
        elif model_name == "textrank":
            pred_keywords = extract_keywords_TEXTRANK(pos_el, doc, top_n=k)
        else:
            pred_keywords = extract_keywords(doc,
                                             model,
                                             tokenizer,
                                             pos_el,
                                             top_n=k)

        gold_keywords_prep = []
        for word in gold_keywords:
            _tmp = []
            for token in strip_accents_and_uppercase(word).split():
                _tmp.append(stemmer.stem(token))
            gold_keywords_prep.append(" ".join(_tmp))

        pred_keywords_prep = []

        for word in pred_keywords:
            _tmp = []
            for token in strip_accents_and_uppercase(word).split():
                _tmp.append(stemmer.stem(token))
            pred_keywords_prep.append(" ".join(_tmp))

        num_predictions += len(pred_keywords_prep)
        num_golds += len(gold_keywords_prep)

        rel = 0
        matched_gold_keywords = []
        for pred_word in sorted(pred_keywords_prep):
            broken = False
            for gold_word in sorted(gold_keywords_prep):
                for token in pred_word.split():
                    if token in gold_word and gold_word not in matched_gold_keywords:
                        rel += 1
                        broken = True
                        matched_gold_keywords.append(gold_word)
                        break
                if broken:
                    break

        num_relevant += rel

    precision_at_k = num_relevant / num_predictions
    recall_at_k = num_relevant / num_golds
    f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k +
                                                    recall_at_k)
    print("Precision@{:d}: {:.3f}".format(k, precision_at_k))
    print("Recall@{:d}: {:.3f}".format(k, recall_at_k))
    print("F1@{:d}: {:.3f}".format(k, f1_at_k))