def german_semantic(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem
    from textblob_de import TextBlobDE
    stopwords = set(stopwords.words("german"))

    liste = []
    stemmer = Cistem()
    wordlist = []

    # clean up the text: lowercase, strip punctuation, collapse whitespace
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    # delete stopwords
    for word in text.split():
        if word not in stopwords:
            liste.append(word)
    text = " ".join(liste)
    # stemmer
    for word in text.split():
        word = stemmer.segment(word)[0]
        wordlist.append(word)
    text = " ".join(wordlist)

    # sentiment
    blob = TextBlobDE(text)
    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity

    return sentiment_polarity, sentiment_subjectivity
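A minimal usage sketch, assuming the textblob-de package and the NLTK German stopword corpus are installed (the sample sentence is purely illustrative):

polarity, subjectivity = german_semantic(
    "Das Essen war wirklich hervorragend und der Service sehr freundlich!")
print(polarity, subjectivity)  # polarity in [-1, 1], subjectivity in [0, 1]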
Example #2
def stem(text):
    from nltk.stem.cistem import Cistem
    # `tokenize` is assumed to be a helper defined elsewhere in the same module
    if isinstance(text, str):
        text = tokenize(text)
    stemmer = Cistem()
    for index, word in enumerate(text):
        text[index] = stemmer.stem(word)
    return ' '.join(text)
Example #3
def stem_cistem(x):
    from nltk.stem.cistem import Cistem
    stemmer = Cistem()
    s_text = []
    for word in x:
        s_text.append(stemmer.stem(word))
    # join the stemmed tokens back into a single space-separated string
    s_text = ' '.join(s_text)
    return s_text
Example #4
    def stem(string, stemmer="porter", **kwargs):

        if stemmer == "porter":
            impl = PorterStemmer()
        elif stemmer == "lancaster":
            impl = LancasterStemmer()
        elif stemmer == "regex":
            regexp = kwargs['regexp']
            if 'min' in kwargs:
                min = kwargs['min']
            else:
                mins = 0
            impl = RegexpStemmer(regexp=regexp, min=min)
        elif stemmer == "isri":
            impl = ISRIStemmer()
        elif stemmer == "snowball":
            if 'language' in kwargs:
                language = kwargs['language']
            else:
                language = 'english'
            impl = SnowballStemmer(language=language)
        elif stemmer == "rslp":
            impl = RSLPStemmer()
        elif stemmer == "cistem":
            if 'case_insensitive' in kwargs:
                case_insensitive = kwargs['case_insensitive']
            else:
                case_insensitive = False
            impl = Cistem(case_insensitive=case_insensitive)
        else:
            return string

        return impl.stem(string)
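A brief usage sketch; the enclosing class is not shown above, so Stemming below is a hypothetical name for it:

# 'Stemming' is a hypothetical class name; the keyword arguments mirror the dispatcher above.
print(Stemming.stem("running"))                                    # Porter: 'run'
print(Stemming.stem("laufende", stemmer="cistem"))                 # German CISTEM stem
print(Stemming.stem("cars", stemmer="regex", regexp="s$", min=4))  # strips a trailing 's'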
Example #5
def determinar_seguimiento(titulo_principal, titulo_querella):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem
    stemmer = Cistem()
    regex = r'\b\w+\b'
    palabras_titulo_principal = [stemmer.stem("".join(re.findall(regex, palabra.lower()))) for palabra in titulo_principal.split(" ") if palabra not in stopwords.words('english')]
    palabras_titulo_querella = [stemmer.stem("".join(re.findall(regex, palabra.lower()))) for palabra in titulo_querella.split(" ") if palabra not in stopwords.words('english')]
    """QUERIES TO NLP APIs SHOULD BE ADDED HERE TO EXTRACT ENTITIES
    AND CONCEPTS"""
    # count how many stemmed words from the complaint title also appear in the main title
    porcentaje_coincidencia = 0
    for palabra_titulo_querella in palabras_titulo_querella:
        if palabra_titulo_querella in palabras_titulo_principal:
            porcentaje_coincidencia += 1
    porcentaje_coincidencia /= len(palabras_titulo_principal)
    # treat an overlap of at least 40% as a follow-up story
    return porcentaje_coincidencia >= 0.4
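A minimal usage sketch with hypothetical headlines (note that, as written, the function filters against the English stopword list):

principal = "Gobierno anuncia nuevo plan de seguridad"
querella = "El nuevo plan de seguridad genera criticas"
print(determinar_seguimiento(principal, querella))  # True once >= 40% of the stemmed words overlap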
Example #6
def build_corpus():
    import os
    import re
    from pathlib import Path
    from dotenv import load_dotenv
    from pymongo import MongoClient
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem
    from nltk.tokenize import word_tokenize

    env_path = Path('../') / '.env'
    load_dotenv(dotenv_path=env_path)
    connection_string = os.getenv("DATABASE_URL")
    mongo_client = MongoClient(connection_string)
    t4g_database = mongo_client.test
    jobs_collection = t4g_database.jobs
    jobs = jobs_collection.find()
    size = jobs_collection.count_documents({})
    stemmer = Cistem()
    corpus = []
    ids = []
    for i, job in enumerate(jobs):
        if i % 1000 == 0: print(f'{i}/{size}')
        indices = []
        print(job['_id'])
        title = job['title']
        _id = job['_id']
        ids.append(_id)
        text = job['detailed_activities'].strip()
        text = ' '.join(text.split())
        # heuristic: insert a space before an uppercase letter that is glued
        # to the preceding word (e.g. "EntwicklungNeuer" -> "Entwicklung Neuer")
        for index in range(len(text)):
            if text[index].isupper() and index > 1 and text[
                    index - 1] != " " and text[
                        index - 2] != " " and not text.endswith(
                            text[index]) and text[index + 1] != " ":
                indices.append(index)

        for index in reversed(indices):
            text = text[:index] + " " + text[index:]

        # keep only letters, including the German umlauts and ß
        text = re.sub('[^A-Za-zäöüÄÖÜß]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        words = []
        for word in tokenized_text:
            stemmed_word = stemmer.stem(word).strip()
            if stemmed_word not in stopwords.words(
                    'german') and word not in stopwords.words(
                        'german'
                    ) and len(stemmed_word) > 2 and stemmed_word not in [
                        'it', '3d'
                    ] and stemmed_word not in title:
                words.append(stemmed_word)

        corpus.append(' '.join(words))
    return corpus, ids
Example #7
## First step: Tokenize each text
from nltk.tokenize import RegexpTokenizer

## Load library for removing stopwords
from nltk.corpus import stopwords
# nltk.download('stopwords')  # has to be downloaded the first time

# Import libraries for stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize

stemmer_ps = PorterStemmer()

from nltk.stem.cistem import Cistem

stemmer_cs = Cistem()

# Import lemmatization libraries
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
#nltk.download('wordnet')

# Load stop words
stop_words = stopwords.words('english')
#print(stop_words[:5])

tokenizer = RegexpTokenizer(r'\w+')
texts_clean = []
texts_aux = []
aux = []
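The processing loop itself is not part of this snippet; below is a minimal sketch of how the prepared objects are typically combined (texts is a hypothetical list of raw documents):

texts = ["This is only a small example document.",
         "Stemming and lemmatization treat words differently."]  # hypothetical input
for text in texts:
    tokens = tokenizer.tokenize(text.lower())                      # tokenize and lowercase
    tokens = [t for t in tokens if t not in stop_words]            # drop English stopwords
    texts_aux.append([stemmer_ps.stem(t) for t in tokens])         # Porter stems
    texts_clean.append([lemmatizer.lemmatize(t) for t in tokens])  # WordNet lemmas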
Example #8
 def __init__(self, case_insensitive: BooleanValue()):
     # wrap nltk's Cistem (imported as _Cistem) behind the project's NltkStemmer interface
     self.case_insensitive = case_insensitive
     NltkStemmer.__init__(self)
     _Cistem.__init__(self, case_insensitive=case_insensitive)
Example #9
def stem(text):
    from nltk.stem.cistem import Cistem
    # expects `text` to be a list of tokens; stems each token in place
    stemmer = Cistem()
    for index, word in enumerate(text):
        text[index] = stemmer.stem(word)
    return text
Example #10
import pandas as pd
import nltk
from nltk.stem.cistem import Cistem

# Initialise
tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
stemmer = Cistem()

# Read Dataset
tweets = pd.read_csv('data/all_tweets.tsv', sep='\t', header=None)

# Read the word list that is used to filter tweets by topic
keywords = []
index_topic_tweets = []
with open("../data/topic_wordlist.txt", "r") as inp:
    for line in inp:
        keywords.append(line.strip())

# Normalise umlauts, lowercase and stem each keyword (keep the result instead of discarding it)
for i, word in enumerate(keywords):
    word = word.replace('ä', 'ae')
    word = word.replace('ö', 'oe')
    word = word.replace('ü', 'ue')
    word = word.replace('ß', 'ss')
    word = word.lower()
    keywords[i] = stemmer.stem(word)

for index, row in tweets.iterrows():
    # Tokenization
    words = tokenizer.tokenize(row[2])
    # Remove short tokens
Example #11
 def _get_cis_stemmer(self, case_insensitive):
     return Cistem(case_insensitive=case_insensitive)
Example #12
from nltk.stem.cistem import Cistem
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score
import re

np.random.seed(500)
stemmer = Cistem()
header_list = ["text_id","text","task_1","task_2"]

regex = re.compile(r'[,.!?|#@;:]')
Corpus = pd.read_csv("train_german.tsv", encoding='latin-1', sep="\t", names=header_list)
Corpus.dropna(subset=['text'], inplace=True)
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
Corpus['text'] = [re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.replace(".", " ") for entry in Corpus['text']]
Corpus['text'] = [regex.sub(' ', entry) for entry in Corpus['text']]
Corpus['text'] = [entry.split() for entry in Corpus['text']]
#Corpus['text']= [sent_tokenize(entry, language='german') for entry in Corpus['text']]


for index,entry in enumerate(Corpus['text']):
    Final_words = []
    for word in entry:
        if word not in stopwords.words('german') and word.isalpha():
Example #13
def preprocess(text: str,
               bool_to_lowercase=True,
               bool_remove_html_tags=True,
               bool_remove_links=True,
               bool_remove_special_symbols=True,
               bool_remove_punctuation=True,
               bool_seperate_numbers_from_text=True,
               bool_stemming=True,
               bool_word_tokenize=True,
               bool_sentence_tokenize=False) -> str:

    # todo: lemmatization

    # transform to lower case
    if bool_to_lowercase:
        text = text.lower()

    # remove symbols, html-tags and links
    if bool_remove_html_tags:
        logging.debug('Removing html tags')
        text = remove_html_tags(text)

    if bool_remove_links:
        logging.debug('Removing links')
        text = remove_links(text)

    if bool_remove_special_symbols:
        logging.debug('Removing special symbols')
        text = remove_special_symbols(text)

    # separate numbers from text
    if bool_seperate_numbers_from_text:
        logging.debug('Separating numbers from text')
        text = seperate_numbers_from_text(text)

    if bool_remove_punctuation:
        logging.debug('Removing punctuation')
        text = remove_punctuation_from_text(text)

    # tokenize words
    if bool_word_tokenize:

        logging.debug('Apply word tokenizing')
        text_list = word_tokenize(text)

        if bool_stemming:
            logging.debug('Apply stemming')
            # create the stemmer once instead of once per token
            stemmer = Cistem(case_insensitive=False)
            text_list = [stemmer.stem(token) for token in text_list]

        # remove stopwords
        logging.debug('Removing stopwords')
        text = remove_stop_words(text_list)

    if bool_sentence_tokenize:
        logging.debug('Apply sentence tokenizing')
        text = sent_tokenize(text, language='german')

    return text
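A short usage sketch, assuming the helper functions referenced above (remove_html_tags, remove_links, remove_special_symbols, seperate_numbers_from_text, remove_punctuation_from_text, remove_stop_words) are defined in the same module:

raw = "<p>Besuchen Sie https://example.com und testen Sie unsere 3 neuen Angebote!</p>"  # hypothetical input
print(preprocess(raw))
# lowercased, markup/links/punctuation removed, numbers separated, tokens stemmed, stopwords dropped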
Example #14
if args.log_to_file:
    log_file_name = args.target.strip() + '.log'
else:
    log_file_name = None
logging.basicConfig(
    filename=log_file_name,
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

logging.info("Using up to {} CPUs".format(args.threads))
if args.lemmatize:
    logging.info("Perform lemmatisation using stanfordnlp")
    nlp_de = stanfordnlp.Pipeline(lang="de", processors="tokenize,lemma", use_gpu=True)
if args.stem:
    logging.info("Perform stemming using nltk's CISTEM stemmer")
    stemmer = Cistem()

stop_words = stopwords.words('german')


# get corpus sentences
class CorpusSentences(object):
    def __init__(self, directory_name: str, chunk_size: int = 10000):
        self.directory_name = directory_name
        self.n = chunk_size

    def __iter__(self):
        for file_name in os.listdir(self.directory_name):
            logging.info("Use corpus file %s " % file_name)
            with codecs.open(os.path.join(self.directory_name, file_name), encoding="utf-8") as f:
                while True:
Example #15
def wordCount(data, dictOutput, catList):
	# load the stopwords
	stopwords = load_stopwords()

	# Create a new dictionary for the output
	outList = collections.OrderedDict()

	# Number of non-dictionary words
	nonDict = 0

	# Convert to lowercase
	data = data.lower().replace("\n", " ")

	# Tokenize and create a frequency distribution
	tokenizer = RegexpTokenizer(r'\w+')
	tokens = tokenizer.tokenize(data)

	fdist = nltk.FreqDist(tokens)
	wc = len(tokens)

	# Using the Cistem stemmer for wildcards, create a stemmed version of the data
	# Cistem is needed for stemming German words
	cistem = Cistem()

	# if a word/token appears in the stopwords, ignore it
	# otherwise: keep the stemmed word in the list
	stems = [cistem.stem(word) for word in tokens if word not in stopwords and len(word) > 0]
	fdist_stem = nltk.FreqDist(stems)

	# Access categories and populate the output dictionary with keys
	for cat in catList:
		outList[cat[0]] = 0

	# Dictionaries are more useful
	fdist_dict = dict(fdist)
	fdist_stem_dict = dict(fdist_stem)

	# Number of classified words
	classified = 0

	for key in dictOutput:
		if "*" in key and key[:-1] in fdist_stem_dict:
			classified = classified + fdist_stem_dict[key[:-1]]
			for cat in dictOutput[key]:
				if cat.isalpha():
					outList[cat] = outList[cat] + fdist_stem_dict[key[:-1]]
		elif key in fdist_dict:
			classified = classified + fdist_dict[key]
			for cat in dictOutput[key]:
				try:
					outList[cat] = outList[cat] + fdist_dict[key]
				except KeyError:
					pass

	# Calculate the percentage of words classified
	if wc > 0:
		percClassified = (float(classified) / float(wc)) * 100
	else:
		percClassified = 0

	# Return the categories, the words used, the word count, the number of words classified,
	# and the percentage of words classified.
	return [outList, tokens, wc, classified, percClassified]
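A brief usage sketch with hypothetical inputs, assuming the module's imports and its load_stopwords helper are available; dictOutput maps dictionary words (a trailing '*' marks a stemmed wildcard entry) to category names, and catList carries the category names in its first column, as the function reads them above:

catList = [["positiv"], ["negativ"]]
dictOutput = {"gut*": ["positiv"], "unfreundlich": ["negativ"]}
data = "Das Essen war gut, der Kellner aber unfreundlich."
ergebnis, tokens, wc, classified, perc = wordCount(data, dictOutput, catList)
print(ergebnis, perc)  # per-category hit counts and the percentage of tokens classified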
Example #16
#!/usr/bin/env python
#coding:utf8
from nltk.tokenize import TweetTokenizer
from nltk.stem.cistem import Cistem
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

nltk.download('stopwords')
tknzr = TweetTokenizer()
stemmer = Cistem(case_insensitive=True)
file_in = open("../data/postillon.txt", "r")
file_out = open("../build/preprocessed/postillon_stem.txt", "w")
for line in file_in:
    tokenized = tknzr.tokenize(line)
    # drop stopwords and stem the remaining tokens
    # (do not remove items from a list while iterating over it)
    tokenized = [stemmer.stem(word) for word in tokenized
                 if word not in stopwords.words('german')]
    token_text = " ".join(tokenized)
    file_out.write(token_text + '\n')
file_in.close()
file_out.close()

data = open("../build/preprocessed/postillon_stem.txt", "r")
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))