Example No. 1
class ModelBuilder():

    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model: self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
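A minimal usage sketch (hypothetical, assuming SnowballStemmer is imported from nltk.stem and data/candidate_synonyms.txt holds comma-separated word,synonym pairs): condense() keeps only pairs found in both directions, so the final model maps a stem to the stems it is mutually paired with.

# Hypothetical usage of ModelBuilder (file path and pair contents assumed).
from nltk.stem import SnowballStemmer

model = ModelBuilder().build().condense().model
stem = SnowballStemmer('english').stem('happy')
print(model.get(stem, []))  # mutually linked synonym stems, if any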
def text_token_data_generator():
    global id_text_index_map
    # Python 2 API: string.maketrans/string.uppercase; in Python 3 use
    # str.maketrans with string.ascii_uppercase/ascii_lowercase instead.
    translation_table = string.maketrans(
        string.punctuation + string.uppercase, " " * len(string.punctuation) + string.lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)
            visible_text = extract_row["visible_text"].encode("ascii", "ignore")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            title = extract_row["title"].encode("ascii", "ignore")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            visible_text.extend(title)
            yield " ".join(visible_text)
def stemWordMatch2(question,sentence):


    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    # Finding matches between words from the same root using the Lancaster stemmer

    '''stemmer=LancasterStemmer()

    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
    stem_word_match_counter.append(count)'''

    stem_word_match_counter=[]
    stem_words_list=[]
    question_words_list=[]

    # Finding matches between words from the same root using the Snowball stemmer

    snowball_stemmer = SnowballStemmer('english')

    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
    #print 'Stem word count match score is :', stem_count

    return stem_count
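An illustrative call (assumes nltk and SnowballStemmer are imported as in the snippet above; the strings are made up):

question = "Who discovered penicillin?"
sentence = "Penicillin was discovered by Alexander Fleming."
# Two sentence stems ('penicillin', 'discov') also occur among the
# question stems, so the score should be 2 * 6 = 12.
print(stemWordMatch2(question, sentence))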
Example No. 4
def wordnet_sim(query, db):
    """
    This function imlements simple wordnet definition lookup and compares it
    with a different block of text. For every word match between the definition
    token and text token doc receives +1.

    INPUT:
    query  --  string that represents user query expanded with word net defs
    db  --  dict representation of database xml file

    OUTPUT:
    maxdoc  --  the document with the highest score
    """
    # print('QUERY:', query)
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split() if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                text += " "  # pad empty blocks to avoid division by zero below
            # here we can finetune the block score multiplicators
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block == "trivia":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "history":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted([(k, v) for k, v in doc_scores.items()], key=lambda x: x[1])
    return (debug, maxdoc)
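A self-contained sketch of how wordnet_sim might be called; the nested db layout (doc -> block -> text) is inferred from the loop above, and the toy data is invented:

# Assumes the same imports as the function: SnowballStemmer, stopwords,
# string, and collections.defaultdict.
db = {
    "doc1": {"description": "A red sports car with a powerful engine.",
             "comments": "Fast car, great engine."},
    "doc2": {"description": "A guide to growing tomatoes in your garden.",
             "comments": "Helpful gardening tips."},
}
scores, best = wordnet_sim("powerful car engines", db)
print(best)  # "doc1" for this toy data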
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)
Example No. 6
def des_extrect():
    filename_list = []
    file_stopwords = file('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]  
    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name) 
    for filename in filename_list:
        path =  os.path.join(DESCRIPTION_DIR, filename)
        fr = file(path, 'r')
        fw = file(filename+'.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            content =  re.sub("[\.\@\,\:\;\!\?\(\)]".decode("utf8"), "".decode("utf8"),content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content: 
                w = stemmer.stem(w)
                # skip stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' +pro_content+'\n')
        fw.close()
        fr.close()
class StemmedCorpus(DocumentCorpus):
    def __init__(self, documents=None, language="german"):
        DocumentCorpus.__init__(self, documents)
        with codecs.open("stopwords/" + language, "r", encoding=my_encoding) as f:
            self._stopwords = [sw.strip() for sw in f.readlines()]
        self._stemmer = SnowballStemmer(language)
        self._lemmatizer = WordNetLemmatizer()
        self._stemmed_documents = []
    
    def preprocess_documents(self, lemmatize=False, remove_stopwords = True):
        _highest_func = self._lemmatize_tokens if lemmatize else self._stemm_tokens
        _second_highest_func = self._remove_stopword if remove_stopwords else lambda x: x
        self._stemmed_documents = [ (_highest_func(_second_highest_func(self._tokenize_document(doc[0].lower()))), doc[1] ) for doc in self._documents]

    def _tokenize_document(self, document):
        return regexp_tokenize(document, pattern_words)
    
    def _remove_stopword(self, tokens):
        return [token for token in tokens if token not in self._stopwords]
    
    def _stemm_tokens(self, tokens):
        return [self._stemmer.stem(token) for token in tokens]

    def _lemmatize_tokens(self, tokens):
        return [self._lemmatizer.lemmatize(token, trans_tag(tag)) for token, tag in pos_tag(tokens)]
Example No. 8
def stemmed(text,language):
    stemmer= SnowballStemmer(language)
    tas=text.split()
    text=""
    for word in tas:
        text=" ".join((text,stemmer.stem(word)))
    return text.lstrip()
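For example (illustrative output, using NLTK's English Snowball rules):

print(stemmed("cats running quickly", "english"))  # -> "cat run quick"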
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
	recipes = []
	with open(input_path, 'r') as f:
		for i, line in enumerate(f):
			if line == '\n':
				break
			if i == 0:
				continue  # skip header
			fields = line.split('\t')
			recipes.append(fields[1].replace("\n", ""))
	recipe_text = re.sub("[^a-z ]", "", ' '.join(recipes))
	recipe_words = re.split("\s+", recipe_text)
	stemmer = SnowballStemmer("english")
	recipe_stems = [stemmer.stem(w) for w in recipe_words]
	if stopwords is not None:
		recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])
	top_words = Counter(recipe_stems).most_common(n_most_common)

	# write to a file
	# do a second pass of the recipe to determine how many of the documents the term is in
	freq_table = open(output_path, 'wb')
	for elt in top_words:
		doc_freq = sum([elt[0] in recipe for recipe in recipes])
		freq_table.write(','.join([str(e) for e in elt]) +','+ str(doc_freq) + '\n')
	freq_table.close()
Example No. 10
class Cleaner(object):
	"""
	the sql query in get_reviews needs to be customized
	"""
	def __init__(self):
		self.sbstem = SnowballStemmer("english")
		replace = string.punctuation + string.digits
		self.replace_punctuation = string.maketrans(replace, ' '*len(replace))
		self.locations = []
		self.cached_stopwords = stopwords.words("english")

	def clean(self, txt):
		# removes stopwords, punctuation
		txt = txt.encode('ascii', 'ignore')
		nopunct = txt.translate(self.replace_punctuation)
		no_locs = [x for x in nopunct.split() if x.lower() not in self.cached_stopwords]
		stemmed = [self.sbstem.stem(x) for x in no_locs]
		return " ".join(stemmed)

	def make_loclist(self, locations):
		locations = list(locations)
		removelist = ['Ho Chi Minh City', 'Phu Quoc Island', 'Halong Bay']
		locations = [x.lower() for x in locations if x not in removelist]		
		locations.extend(['ho chi minh','hoan','kiem','phu quoc', 'halong', 'vietnam', 'dong','vnd','vdn'])
		locations.extend(['vietnames', 'nhatrang','saigon','america','maryland','york'])
		loc_wordlist = [f.split() for f in locations]
		loc_wordlist = list(itertools.chain(*loc_wordlist))
		self.cached_stopwords.extend(loc_wordlist)
		return loc_wordlist
def norm_corpus(document_list):
    norm_doc_list = []
    
    # lowercase
    document_list = [word.lower() for word in document_list]

    
    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym,'') for word in document_list]
    
    
    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)
        
        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]
        
        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]
        
        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)
    
    return norm_doc_list
def preprocessing(doc): #stop word as optional
        x = re.sub("[^a-zA-Z]", " ", doc) #only words
        x = x.lower().split()
        stemmer = SnowballStemmer("english") # use snowball
        stops = set(stopwords.words("english")) # set is faster than list
        x = [stemmer.stem(word) for word in x if word not in stops]
        return(x)
Example No. 13
class VocKeyworder(BaseKeyworder):
    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            vocs = []
            try:
                token = token.encode('utf8')
                vocs.append(re.sub(r"'\w+", '', token).lower())
                vocs.append(self._lemmatizer.lemmatize(vocs[0]))
                vocs.append(self._stemmer1.stem(vocs[0]))
                vocs.append(self._stemmer2.stem(vocs[0]))
            except UnicodeDecodeError:
                continue
            if vocs[0] == '':
                continue
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any([voc in self._vocs for voc in vocs]):
                print 'voc', vocs, token
                self._add_keyword(gag_id, token)
Example No. 14
def procesar(request, identificador):
	lmtzr = WordNetLemmatizer()
	d = Documento.objects.get(id=identificador)
	
	#nltk.corpus.cess_esp.words()
	
	
	tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
	#print tokens
	#scentence = d.contenido

	#scentence = scentence.lower() 

	words = tokens
	spanish_stemmer = SnowballStemmer('spanish')
	

	#This is the simple way to remove stop words
	important_words=[]
	for word in words:
		if word not in stopwords.words('spanish'):
		    important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])




	return render_to_response('templates/documentoProcesado.html', 
				{
					'original': d.contenido,
					'tokens': tokens,
					'important_words' : important_words,
					#'pos_tags': pos_tags,
					#'ne_chunks': ne_chunks.subtrees(),
				})
Example No. 15
def normalized_token(token):
    """
    Use stemmer to normalize the token.
    Call this function when building the graph, rather than storing the altered word forms in file_text.
    """
    stemmer = SnowballStemmer("english") 
    return stemmer.stem(token.lower())
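An illustrative call (note that a new stemmer instance is created on every call, which is simple but slow for large graphs):

print(normalized_token("Relations"))  # -> "relat"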
Example No. 16
class Model:
    def __init__(self):
        self.model = ModelBuilder().build().condense().model
        self.stemmer = SnowballStemmer("english")

    def simset(self, word):
        stemmed_word = self.stemmer.stem(word)
        return self.model.get(stemmed_word, [])
Example No. 17
def stemLem(w):
	lemmatizer = WordNetLemmatizer()
	stemmer = SnowballStemmer("english")
	#stemmer = PorterStemmer()

	lem = lemmatizer.lemmatize(w)
	if len(w) > len(lem):
		return lem
	return stemmer.stem(w)
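Illustrative calls (assume WordNetLemmatizer and SnowballStemmer are imported from nltk.stem):

print(stemLem("children"))  # lemma "child" is shorter than the word, so it wins
print(stemLem("running"))   # lemma is unchanged, so the stem "run" is returned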
Example No. 18
def stemmed_top_user_words(usertxt, num=10):
	wl_usertxt = word_tokenize(usertxt.lower())
	num = min(num, len(wl_usertxt))

	snowball_stemmer = SnowballStemmer("english")
	stemmed_fl_usertxt = [snowball_stemmer.stem(w) for w in wl_usertxt if (len(w)>4 and w not in ewl)]
	fd_user_ls = [w[0] for w in FreqDist(Text(stemmed_fl_usertxt)).most_common(num)]

	return fd_user_ls
Example No. 19
    def __call__(self, doc):
        snowball_stemmer = SnowballStemmer('english')
        #tokenizer = RegexpTokenizer(r'\w+')
        #words = [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        words = [snowball_stemmer.stem(t) for t in word_tokenize(doc)]
        stop_words = set(stopwords.words('english'))
        stop_words.update(self.mystops)
        stop_words = list(stop_words)
        return [i.lower() for i in words if i not in stop_words]
Example No. 20
def main(input_file, dbname):
    """
        Main function. Connects to a database and reads a\
        CSV with the arousal and valence. Uses the sentiment \
        library to compute the sentiment of a new.

          :param input_file: the ANEW file
          :param dbname: the name of the database

    """

    # read ANEW file
    if not os.path.exists(input_file):
        logging.error('File %s does not exist', input_file)
        sys.exit(1)
    else:
        csvfile = open(input_file, 'r')
        reader = csv.reader(csvfile, delimiter=',')
        reader.next()  # skip headers
        stemmer = SnowballStemmer('spanish')
        anew = dict([(stemmer.stem(unicode(row[2], 'utf-8')),
                      {'valence': float(row[3]),
                       'arousal': float(row[5])}) for row in reader])

    couch = couchdb.Server()
    database = couch[dbname]
    logging.info('Established connection with the db %s', dbname)

    for element in database:
        doc = database.get(element)

        comments = " ".join([comment['cleaned_summary']
                            for comment in doc['comments']])
        description = " ".join([database.get(element)['title'],
                                doc['description']])

        sentiment_comments = get_sentiment(anew, comments)
        sentiment_description = get_sentiment(anew, description)

        if sentiment_comments is not None and sentiment_description is not None:
            logging.info('%s val: %.2f - %.2f aro: %.2f - %.2f : %s',
                         doc.id, sentiment_comments[0],
                         sentiment_description[0],
                         sentiment_comments[1],
                         sentiment_description[1],
                         doc['title'])
            doc['sentiments'] = {'comments':
                                {'valence': sentiment_comments[0],
                                 'arousal': sentiment_comments[1]},
                                 'description':
                                {'valence': sentiment_description[0],
                                 'arousal': sentiment_description[1]}}
            database.save(doc)

        else:
            logging.warn('%s could not be analyzed. skiping ...',
                         database.get(element)['title'])
def stemWordMatch(question,sentence):

    snowball_stemmer = SnowballStemmer('english')

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    #print 'Question is :',question_tokens
    #print 'Sentence is :',sentence_tokens
    count=0
    for i in sentence_tokens:
        #Finding the exact word match
        if snowball_stemmer.stem(i).lower() in  [snowball_stemmer.stem(x).lower() for x in question_tokens]:
            count=count+6
        elif i.lower() in [x.lower() for x in question_tokens]:
            count=count+3
    #print 'Exact word match count is :',count
    return count
Example No. 22
def preprocess_tweets(tweets):
    stemmer = SnowballStemmer("english")
    stop = set(stopwords.words("english"))
    tweet_texts = [ " ".join(stemmer.stem(i) if len(i) > 1 else i
                                for i in ("".join(c for c in word if c not in string.punctuation)
                                            for word in tweet["text"].lower().split())
                                if i and i not in stop)
                    for tweet in tweets ]
    return list(set(tweet_texts))
    def stem(self, content):
        import re

        original_string = content
        new_content = re.sub('[^a-zA-Z0-9\n\.]', ' ', original_string)
        words = new_content.split()
        stemmer = SnowballStemmer('english')
        singles = [stemmer.stem(wordsa) for wordsa in words]
        return (' '.join(singles))
Example No. 24
    def __init__(self, data="en600.468/aligner/data/hansards", num_sents=sys.maxint):
        f_data = "%s.%s" % (data, "f")
        e_data = "%s.%s" % (data, "e")

        bitext = [[sentence.strip().split() for sentence in pair] for pair in zip(open(f_data), open(e_data))[:num_sents]]

        # Stem words before model training
        french_stemmer = SnowballStemmer("french")
        english_stemmer = SnowballStemmer("english")
        bitext_stemmed = []
        for (n, (f, e)) in enumerate(bitext):
            f_stemmed = [french_stemmer.stem(word.decode("utf-8")) for word in f]
            e_stemmed = [english_stemmer.stem(word) for word in e]
            bitext_stemmed.append([f_stemmed, e_stemmed])

        bitext = bitext_stemmed

        self._train(bitext)
        self._align(bitext)
Example No. 25
    def stem_text(self):
        '''
        Perform stemming
        '''

        stemmer = SnowballStemmer("english")
        stemmed_sents = []
        for sent in self.tok_text:
            stemmed_sents.append([stemmer.stem(tok) for tok in sent])
        self.stem_text = stemmed_sents
Example No. 26
def filterFile(filein,fileout):
    #Turn to lowercase, remove spaces, replace or remove special symbols,
    #and stem words

    fin = open(filein,"r")
    fout = open(fileout,"w")

    snowball = SnowballStemmer('english')
    end = "End of the Project Gutenberg"

    #remove the header
    lines = fin.readline()
    while re.match(r'^Title',lines)==None:
        lines = fin.readline()
        
    lines = re.sub(r'[:,]+',"",lines)
    words = lines.split()
    for word in words: 
        fout.write(snowball.stem(word.lower())+" ")

    #Filter and stem
    for line in fin:
        if end in line:
            break
        elif re.search('\S',line):
            line = line.lower()
            line = re.sub(r'\s+'," ",line)
            line = re.sub(r'&',"and",line)
            line = re.sub(r'[\[\]\'\"()@#$%^&*?\|!.,:;]+|(--)+','',line)
            line = re.sub(r'[_-]+'," ",line)
            words = line.split()
            for word in words: 
                try:
                    fout.write(snowball.stem(word)+" ")
                except ValueError:
                    #REMOVES ALL ACCENTS
                    word = unicode(word,"utf-8")
                    word = unidecode(word)
                    fout.write(snowball.stem(word)+" ")
        
    fin.close()
    fout.close()
Example No. 27
	def stemming(self, words):
		''' Make a stem for each word in the array
		@return array of stemmed words '''

		russian_stemmer = SnowballStemmer('russian')
		stemming = list()
		for w in words:
			try:
				stemming.append(russian_stemmer.stem(w))
			except Exception:
				pass
		return stemming
Example No. 28
def process_spanish_owned():
    from inflector import Inflector, Spanish
    inflector = Inflector(Spanish)

    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer("spanish")

    file_valid = open('valid_words.txt', "r")
    lines = file_valid.readlines()
    valid_words = lines[0].split(' ')
    print len(valid_words)
    file_valid.close()
    #valid_words = set(valid_words)
    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi', 'combis', 'susana', 'villaran', 'villarán', 'castañeda']

    file = open("raw_words.txt", 'r')
    fileout = open("spanish_words_owned.txt", 'w')
    fout_sing = open("spanish_words_sing.txt", 'w')
    fout_stem = open("spanish_words_stem.txt", 'w')
    nline = 0

    for line in file:
        nline += 1
        words = line.split(' ')
        processed = []
        ini_line = True
        for word in words:
            if (word != '') & (word != '\n') & (word != 'servicio') & (word != 'servicio\n'):
                word = word.replace('\n', '')
                if (word in valid_words) | (word in owned_words):
                    processed.append(word)
                    if word != 'bus':
                        word_singular = inflector.singularize(word)
                        #word_singular = word_singular.replace(u'\xF3'.encode('utf-8'), 'o')
                    else:
                        word_singular = word
                    word_stemmed = stemmer.stem(word.decode('utf-8')).encode('utf-8')
                    if ini_line:
                        fileout.write(word)
                        fout_sing.write(word_singular)
                        fout_stem.write(word_stemmed)
                        ini_line = False
                    else:
                        fileout.write(' ' + word)
                        fout_sing.write(' ' + word_singular)
                        fout_stem.write(' ' + word_stemmed)
                    print nline, word, word_singular, word_stemmed
        fileout.write('\n')
        fout_sing.write('\n')
        fout_stem.write('\n')
    file.close()
    fileout.close()
    fout_sing.close()
    fout_stem.close()
def tokenize(resultList1):
    entrada=[]
    for i in range(0,len(resultList1)):
        sentence=resultList1[i]
        tokens = word_tokenize(sentence)
        filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]

        stemmer = SnowballStemmer('spanish')
        for i in filtered_words:
            entrada.append( stemmer.stem(i))
    return entrada
def tokenize(resultList1):
    entrada=[]
    tokens = word_tokenize(resultList1)
    filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]

    stemmer = SnowballStemmer('spanish')
    for i in filtered_words:
        stri = unicode(i,errors='replace')
        entrada.append(stemmer.stem(stri))

    return entrada
Example No. 31
def stem_txt(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])
# Stopwords
#####

from nltk.corpus import stopwords
stoplist = stopwords.words('english')
# keep if not a stopword
nostop = [t for t in norm_numbers if t not in stoplist]
print(nostop)

#####
# Stemming
#####

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('german')  # snowball stemmer, german
print(stemmer.stem("Autobahnen"))
stemmer = SnowballStemmer('english')  # snowball stemmer, english
# remake list of tokens, replace with stemmed versions
tokens_stemmed = [stemmer.stem(t) for t in tokens]
print(tokens_stemmed)

# other options:
# from nltk.stem import PorterStemmer
# from nltk.stem import WordNetLemmatizer

#####
# Corpus statistics
#####
docs = df1['snippet']

print(len(sentences), 'sentences in corpus.')
Example No. 33
def stemmatize(text):
    sb = SnowballStemmer("english")
    return sb.stem(text)
Example No. 34
class SnowballTokenizer(object):
    def __init__(self):
        self.sbs = SnowballStemmer('english')

    def __call__(self, doc):
        return [self.sbs.stem(t) for t in word_tokenize(doc)]
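A callable tokenizer like this is typically handed to a scikit-learn vectorizer; a hedged sketch (assumes scikit-learn 1.0+ is installed and word_tokenize is imported from nltk.tokenize):

from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer calls SnowballTokenizer() on each document, so the
# vocabulary is built from Snowball stems rather than surface forms.
vec = TfidfVectorizer(tokenizer=SnowballTokenizer())
X = vec.fit_transform(["Cats are running", "A cat ran quickly"])
print(vec.get_feature_names_out())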
Example No. 35
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
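For example (illustrative): the word is lemmatized as a verb first, then stemmed.

print(lemmatize_stemming("running"))  # 'running' -> lemma 'run' -> stem 'run'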
Example No. 36
print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')

print rs.stem('lying')

print rs.stem('strange')

# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")

print 'Supported Languages:', SnowballStemmer.languages

# Autobahnen ("highways", plural) -> stem: autobahn
ss.stem('autobahnen')

# springen ("to jump") -> stem: spring
ss.stem('springen')

# lemmatization
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# lemmatize nouns
print wnl.lemmatize('cars', 'n')
print wnl.lemmatize('men', 'n')

# lemmatize verbs
Example No. 37
class _AutoTag(object):
    """ A class to auto tag posts, using tf-idf. """

    WORDS = '([A-Za-z]+[A-Za-z-]*[A-Za-z]+|[A-Za-z]+)'

    def tag(self, post, count=5):
        """ Return a list of top tags, given a post.

        post: can either be a post object or the source path
        count: the number of tags to return

        """

        if isinstance(post, (bytes_str, unicode_str)):
            source_path = post
            post = self._get_post_from_source_path(source_path)
            if post is None:
                LOGGER.error('No post found for path: %s' % source_path)
                return

        return self._find_top_scoring_tags(post, count)

    # ### 'object' interface ###################################################

    def __init__(self, site, use_nltk=True):
        """ Set up a dictionary of documents.

        Each post is mapped to a list of words it contains.

        """

        self._site = site
        self._documents = {}
        self._stem_cache = {}
        self._use_nltk = use_nltk and self._nltk_available()
        self._tag_set = set([])

        if self._use_nltk:
            from nltk.corpus import stopwords
            from nltk.tokenize import word_tokenize
            from nltk.stem import SnowballStemmer

            self._tag_pattern = re.compile(self.WORDS + '$')
            self._tokenize = word_tokenize
            self._stem_word_mapping = defaultdict(set)
            self._stemmer = SnowballStemmer('porter')
            self._stopwords = set(stopwords.words())

        else:
            self._tag_pattern = re.compile(self.WORDS)

        self._process_tags()
        self._process_posts()
        self._document_count = len(self._documents)

    # ### 'Private' interface ##################################################

    def _find_stems_for_words_in_documents(self, text):
        """ Process text to get list of stems. """

        words = []

        for word in self._tokenize(text):
            if self._tag_pattern.match(word) is not None:
                if word not in self._stopwords:
                    words.append(self._get_stem_from_cache(word))

        return words

    def _find_top_scoring_tags(self, post, count):
        """ Return the tags with the top tf-idf score. """

        tf_idf_table = {}

        for word in self._documents[post.source_path]:
            tf_idf_table[word] = self._tf_idf(word, post)
            tags = sorted(
                tf_idf_table, key=lambda x: tf_idf_table[x], reverse=True
            )

        if self._use_nltk:
            tags = [
                sorted(self._stem_word_mapping[tag], key=len)[0]
                for tag in tags[:count]
            ]

        else:
            tags = tags[:count]

        return tags

    def _get_post_from_source_path(self, source):
        """ Return a post given the source path. """

        posts = [
            post for post in self._site.timeline
            if post.source_path == source
        ]

        post = posts[0] if len(posts) == 1 else None

        return post

    def _get_post_text(self, post):
        """ Return the text of a given post. """

        with codecs.open(post.source_path, 'r', 'utf-8') as post_file:
            post_text = post_file.read().lower()
            if not post.is_two_file:
                post_text = post_text.split('\n\n', 1)[-1]

        return post_text

    def _get_word_count(self, post):
        """ Get the count of all words in a given post. """

        word_counts = defaultdict(lambda: 0)

        for word in self._documents[post.source_path]:
            word_counts[word] += 1

        return word_counts

    def _get_stem_from_cache(self, word):
        """ Return the stem for a word, and cache it, if required. """

        if word not in self._stem_cache:
            stem = self._stemmer.stem(word)
            self._stem_cache[word] = stem
            self._stem_word_mapping[stem].add(word)
        else:
            stem = self._stem_cache[word]

        return stem

    def _modified_inverse_document_frequency(self, word):
        """ Gets the inverse document frequency of a word.

        This departs from the normal inverse document frequency
        calculation, to give a higher score for words that are already
        being used as tags in other posts.
        """

        if word not in self._tag_set:
            count = sum(
                1 for doc in self._documents.values() if word.lower() in doc
            )
        else:
            count = 0.25

        return math.log(self._document_count / float(count))

    @staticmethod
    def _nltk_available():
        """ Return True if we can import nltk. """

        try:
            import nltk
        except ImportError:
            nltk = None

        return nltk is not None

    def _process_posts(self):
        """ Tokenize the posts (and stem the words, if use_nltk). """

        for post in self._site.timeline:

            text = self._get_post_text(post)

            if not self._use_nltk:
                words = self._tag_pattern.findall(text)

            else:
                words = self._find_stems_for_words_in_documents(text)

            self._documents[post.source_path] = words

    def _process_tags(self):
        """ Create a tag set, to be used during tf-idf calculation. """

        tags = self._site.posts_per_tag.keys()

        if not self._use_nltk:
            self._tag_set = set(tags)

        else:
            self._tag_set = set(self._get_stem_from_cache(tag) for tag in tags)

    def _term_frequncy(self, word, post):
        """ Returns the frequency of a word, given a post. """

        word_counts = self._get_word_count(post)

        # A mix of augmented and logarithmic frequency. We divide by
        # the max frequency to prevent a bias towards longer documents.
        tf = math.log(
            1 + float(word_counts[word]) / max(word_counts.values())
        )

        return tf

    def _tf_idf(self, word, post):
        """ Return tf-idf value of a word, in a specified post. """

        tf = self._term_frequncy(word, post)
        idf = self._modified_inverse_document_frequency(word)

        return tf * idf
Example No. 38
def clean(text, remove_stopwords=False, stem_words=False):

    text = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    #print(text)
    text = re.sub(r"b'rt", '', text)
    text = re.sub(r"http.+?\s", '', text)
    text = re.sub(r"@.+?\s", '', text)
    text = re.sub(r"[0-9]+", '', text)
    text = re.sub(r"x[a-z][0-9]", '', text)
    text = re.sub(r"x[a-z]", '', text)
    text = re.sub(r"x", '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    '''
    tokens = word_tokenize(text)
    pos_text = pos_tag(tokens)
    new_text = ' '.join([i[0]+'-'+i[1] for i in pos_text])
    '''

    # Return a list of words
    return (text)
Example No. 39
class process_list:
    def __init__(self, debug=False):
        # LIST person_type
        self._polarity = []
        self._other = []
        # StopWord
        self._stopwords = nltk.corpus.stopwords.words('spanish')
        self._stopwords_no_accents = []
        for w in self._stopwords:
            self._stopwords_no_accents.append(
                self.delete_special_characters(self.delete_accents(w)))
        # Stemmer
        self._spanis_stemmer = SnowballStemmer('spanish')
        #Debug Print Message
        self.debug = debug

    def load_list(self,
                  type_file_parm,
                  _file_type="lexicons/politico.csv",
                  separator="\t"):

        if (type_file_parm is type_file_enum.polarity):
            with open(_file_type, newline='') as csvFileBow:
                reader = csv.reader(csvFileBow, delimiter=separator)
                for row in reader:
                    new_row = [
                        row[0],
                        self.delete_accents(row[0]),
                        self.delete_special_characters(
                            self.delete_accents(row[0])),
                        self._spanis_stemmer.stem(
                            self.delete_accents(self.delete_accents(row[0]))),
                        row[1]
                    ]
                    self._polarity.append(new_row)
                    #print(new_row)
                print("Summary Lexicon: ", _file_type, " #Words: ",
                      len(self._polarity))

        if (type_file_parm is type_file_enum.other):
            with open(_file_type, newline='') as csvFileBow:
                reader = csv.reader(csvFileBow, delimiter=';')
                for row in reader:
                    new_row = [
                        row[0],
                        self.delete_accents(row[0]),
                        self.delete_special_characters(
                            self.delete_accents(row[0])),
                        self._spanis_stemmer.stem(
                            self.delete_accents(self.delete_accents(row[0]))),
                        row[1] + "-" + row[2] + "-" + row[3]
                    ]
                    self._other.append(new_row)
                    #print(new_row)

    def filter_word(self, word, type_file_parm):
        debug = True
        if (type_file_parm is type_file_enum.polarity):
            original = word
            matching = [[original, s[0]] for s in self._polarity
                        if word == s[0]]
            if debug:
                print("1.Characters ", original, "-", word, " - ", matching)
            if (len(matching) == 0):
                word = self.delete_accents(word)
                matching = [[original, s[1]] for s in self._polarity
                            if word == s[1]]
                if debug:
                    print("2.Characters ", original, "-", word, " - ",
                          matching)
                if (len(matching) == 0):
                    word = self.delete_special_characters(
                        self.delete_accents(word))
                    matching = [[original, s[2]] for s in self._polarity
                                if word == s[2]]
                    if debug:
                        print("3.Characters ", original, "-", word, " - ",
                              matching)
                    if (len(matching) == 0):
                        word = self._spanis_stemmer.stem(
                            self.delete_special_characters(
                                self.delete_accents(word)))
                        matching = [[original, s[3]] for s in self._polarity
                                    if word == s[3]]
                        if debug:
                            print("4.Characters ", original, "-", word, " - ",
                                  matching)
                        if (len(matching) == 0):
                            return "No identificado", 0
                        else:
                            return "No personal", matching
                    else:
                        return "No personal", matching
                else:
                    return "No personal", matching
            else:
                return "No personal", matching
        return "No identificado", 0

    def process_text(self, text):

        result_words = []
        result_polarity = None
        polarity_word = None
        polarity_value = 0
        polarity_average = 0
        polarity_label = ""
        counter = 0
        neg = 0
        pos = 0
        value = 0

        try:
            words_text = nltk.word_tokenize(text)
            words_text = self.delete_stopword(words_text)

            for word in words_text:
                polarity_word = self.filter_word_generic(
                    word, type_file_enum.polarity)
                #print("Polarity word =", polarity_word)
                #print(polarity_word)
                if polarity_word[1] != 0:
                    counter = counter + 1
                    if int(polarity_word[1][0][2]) < 0:
                        #counter = counter + 1
                        neg = neg - 1
                    elif int(polarity_word[1][0][2]) > 0:
                        #counter = counter + 1
                        pos = pos + 1
                    result_words.append(polarity_word)
                    if self.debug: print(polarity_word)

            if abs(neg) > pos:
                value = neg
                #value = pos - abs(neg)
            else:
                value = pos
                #value = abs(neg) - pos

            if counter != 0:
                polarity_average = value / counter
            # polarity_average = value
            else:
                polarity_average = 0

            polarity_value = round(polarity_average)
            #polarity_value = 0

            #if polarity_average >= 2:
            #polarity_label = "Positivo"
            #polarity_value = 1
            #elif polarity_average <= -2:
            #polarity_label = "Negativo"
            #polarity_value = -1
            #else:
            #polarity_label = "Neutro"

            if polarity_value == 1:
                polarity_label = "Positivo"
            elif polarity_value == -1:
                polarity_label = "Negativo"
            else:
                polarity_label = "Neutro"
        except:
            print("Error - polarity : ", polarity_word)

        result_polarity = {
            'Polarity': polarity_value,
            'Average': polarity_average,
            'Label': polarity_label,
            'Words': result_words
        }
        return result_polarity

    def filter_word_generic(self, word, type_file_parm):

        if (type_file_parm is type_file_enum.polarity):
            return self.internal_count_list(word, self._polarity, {
                "error": "No identificado",
                "successful": "polarity"
            })
        if (type_file_parm is type_file_enum.occupation):
            return self.internal_count_list(word, self._other, {
                "error": "No identificado",
                "successful": "other"
            })
        return "ERROR 0001 - LIST - NOT FOUND", 0

    def internal_count_list(self, word, list_porcess, response_text):
        original = word
        matching = [[original, s[0], s[4]] for s in list_porcess
                    if word == s[0]]
        if self.debug: print("-----------------------------------")
        if self.debug:
            print("1.Characters ", original, "-", word, " - ", matching)
        if (len(matching) == 0):
            word = self.delete_accents(word)
            matching = [[original, s[1], s[4]] for s in list_porcess
                        if word == s[1]]
            if self.debug:
                print("2.Characters ", original, "-", word, " - ", matching)
            if (len(matching) == 0):
                word = self.delete_special_characters(
                    self.delete_accents(word))
                matching = [[original, s[2], s[4]] for s in list_porcess
                            if word == s[2]]
                if self.debug:
                    print("3.Characters ", original, "-", word, " - ",
                          matching)
                if (len(matching) == 0):
                    word = self._spanis_stemmer.stem(
                        self.delete_special_characters(
                            self.delete_accents(word)))
                    matching = [[original, s[3], s[4]] for s in list_porcess
                                if word == s[3]]
                    if self.debug:
                        print("4.Characters ", original, "-", word, " - ",
                              matching)
                    if (len(matching) == 0):
                        return response_text["error"], 0
                    else:
                        return response_text["successful"], [matching[0]]
                else:
                    return response_text["successful"], [matching[0]]
            else:
                return response_text["successful"], [matching[0]]
        else:
            return response_text["successful"], [matching[0]]

    #Remove accents (tildes)
    def delete_accents(self, _word):
        return ''.join((c for c in unicodedata.normalize('NFD', _word)
                        if unicodedata.category(c) != 'Mn'))

    #Remove special characters
    def delete_special_characters(self, lin):
        lin = re.sub('\/|\\|\\.|\,|\;|\:|\n|\?|\'|\t', ' ',
                     lin)  # removes punctuation marks
        lin = re.sub("\s+\w\s+", " ", lin)  # removes isolated single characters
        lin = re.sub("\.", "", lin)
        lin = re.sub(" ", "", lin)
        return lin.lower()

    #Remove stopwords
    def delete_stopword(self, text):
        return_data = []
        for word in text:
            if (word.lower() not in self._stopwords_no_accents) and (
                    word != "") and (len(word) > 2):
                return_data.append(word.lower())
        return return_data
Example No. 40
def preprocess_text(text,
                    tokenize=False,
                    ner=False,
                    stem=False,
                    stopw=False,
                    all_lower=False,
                    strip_punct=True):
    """
    Preprocesses and cleans text
    :param ner: Do Named Entity Recognition and join into one word
    :param stem: Stem text
    :param stopw: Remove stopwords
    :param all_lower: lowercase text
    :param strip_punct: strips punctuation
    :return: preprocessed text
    """

    # Clean the text
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"i\.e\.", "", text)
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r'"', " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r"^e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"^b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"^u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)

    if ner:
        tokenized_text = word_tokenize(text)
        tagged_text = pos_tag(tokenized_text)
        chunked_text = ne_chunk(tagged_text, binary=True)

        named_entities = extract_entity_names(chunked_text)
        for named_entity in named_entities:
            entity = named_entity.replace(".", "")
            entity = re.sub(r'\s+', "_", entity)
            text = text.replace(named_entity, entity)

    if all_lower:
        text = text.lower()

    if stopw:
        global stops
        if stops is None:
            try:
                stops = set(stopwords.words("english"))
            except Exception as e:
                print("%s - Please download english stopwords from NLTK" % e)
                exit()
        text = [word.strip() for word in text.split() if word not in stops]
        text = " ".join(text)

    if tokenize:
        text = word_tokenize(text)
        text = " ".join(text)

    # shorten words to their stems
    if stem:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    if strip_punct:
        text = text.translate(str.maketrans('', '', string.punctuation))

    text = text.strip()

    # Empty string
    if text == '':
        return EMPTY_TOKEN

    return text
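An illustrative call with a few flags enabled; stops and EMPTY_TOKEN are module-level names referenced by the function (and stopwords is assumed imported from nltk.corpus as in the snippets above), so they are stubbed here with hypothetical values:

stops = None              # populated lazily inside preprocess_text
EMPTY_TOKEN = '<empty>'   # hypothetical placeholder for empty results
print(preprocess_text("What's the U.S. e-mail policy?",
                      stem=True, stopw=True, all_lower=True))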
Example No. 41
class Preprocess():
    def __init__(self, rootDir, word_dict, inv_words):
        self.rootDir_ = rootDir
        self.class_words_dict_ = word_dict
        self.inv_words_dict_ = inv_words
        self.imputer_ = KNN(k=1)
        self.enc_ = OrdinalEncoder()
        self.spanish_stemmer_ = SnowballStemmer('spanish')
        self.special_words_ = ['piez']
        self.stopwords_spanish_ = stopwords.words('spanish')
        self.df_ = pd.DataFrame(columns=[
            'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
            'Empaque', 'Contenido', 'UnidadMedida', 'LocalidadGeografica',
            'Fuente', 'precio', 'fecha'
        ])

        self.data_ = self.import_data()
        self.add_stop_words()
        self.preprocess('descripcion')
        self.categorize()
        self.append_df()
        self.join_marca_submarca_drop_null()
        self.imputation()
        self.inv_words_funct()
        self.drop_unused_columns()

    def import_data(self):
        '''
        Import all files in a directory, without descending into subfolders
        '''
        data = {}
        path = self.rootDir_ + '*.csv'
        for fname in glob.glob(path):
            data[fname.split('\\')[1].split('.csv')[0]] = pd.read_csv(
                fname, index_col=0)
            try:
                data.get(fname.split('\\')[1].split('.csv')
                         [0])['fecha'] = pd.to_datetime(data.get(
                             fname.split('\\')[1].split('.csv')[0])['fecha'],
                                                        format='%d-%m-%Y')
            except KeyError:
                print('Check datetime values, as I didnt find them.')
        return data

    def add_stop_words(self):
        new_stop_words = ['s']
        self.stopwords_spanish_.extend(new_stop_words)

        return self

    def tokenize(self, data):
        '''
        Input: the complete string
        Output: the tokenized string as a list of strings
        '''
        return word_tokenize(data)

    def remove_stopwords_punctuation(self, data):
        clean_description = []
        for word in data:
            if (word not in self.stopwords_spanish_
                    and word not in string.punctuation):
                clean_description.append(word)

        return clean_description

    def remove_accents(self, data):

        return [unidecode.unidecode(word) for word in data]

    def lowercasing(self, data):

        return [word.lower() for word in data]

    def stemming(self, data):

        return [self.spanish_stemmer_.stem(word) for word in data]

    def remove_duplicates(self, data):
        seen = set()
        result = []
        for item in data:
            if item not in seen:
                seen.add(item)
                result.append(item)

        return result

    def split_number_letter(self, data):
        result = []
        for word in data:
            match = re.match(r'([0-9]+)([a-z]+)', word, re.I)
            if match:
                for element in match.groups():
                    result.append(element)
            else:
                result.append(word)
        return result

    def remove_special_char(self, data):
        result = []
        for word in data:
            if (word not in self.special_words_):
                result.append(word)
        return result

    def preprocess(self, column_name):
        for values in self.data_.values():
            values[column_name] = values.apply(
                lambda row: self.tokenize(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_accents(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.lowercasing(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.split_number_letter(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_stopwords_punctuation(row[column_name]
                                                              ),
                axis=1)
            values[column_name] = values.apply(
                lambda row: self.stemming(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_special_char(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_duplicates(row[column_name]), axis=1)
        return self

    def append_df(self):
        for element in self.data_.keys():
            self.df_ = self.df_.append(self.data_.get(element),
                                       ignore_index=True)

        return self

    def categorize(self):
        for base_key in self.data_.keys():
            self.data_.get(base_key).reset_index(drop=True, inplace=True)
            columns_to_add = [
                'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
                'Empaque', 'Contenido', 'UnidadMedida'
            ]
            for i in columns_to_add:
                self.data_.get(base_key)[i] = np.nan
            self.data_.get(base_key)['Fuente'] = base_key
            for row in range(len(self.data_.get(base_key))):
                for element in self.data_.get(base_key)['descripcion'][row]:
                    if element in self.class_words_dict_.get('Tipo'):
                        self.data_.get(base_key)['Tipo'].loc[row] = element
                    if element in self.class_words_dict_.get('Tipo_2'):
                        self.data_.get(base_key)['Tipo_2'].loc[row] = element
                    if element in self.class_words_dict_.get('Tipo_3'):
                        self.data_.get(base_key)['Tipo_3'].loc[row] = element
                    if element in self.class_words_dict_.get('Tipo_4'):
                        self.data_.get(base_key)['Tipo_4'].loc[row] = element
                    if element in self.class_words_dict_.get('Marca'):
                        self.data_.get(base_key)['Marca'].loc[row] = element
                    if element in self.class_words_dict_.get('Submarca'):
                        self.data_.get(base_key)['Submarca'].loc[row] = element
                    if element in self.class_words_dict_.get('Empaque'):
                        self.data_.get(base_key)['Empaque'].loc[row] = element
                    if element in self.class_words_dict_.get('Contenido'):
                        self.data_.get(
                            base_key)['Contenido'].loc[row] = element
                    if element in self.class_words_dict_.get('UnidadMedida'):
                        self.data_.get(
                            base_key)['UnidadMedida'].loc[row] = element

        return self

    def join_marca_submarca_drop_null(self):
        self.df_['Submarca'].fillna('', inplace=True)
        self.df_['Marca'] = self.df_['Marca'] + self.df_['Submarca']
        self.df_.drop(['Submarca'], axis=1, inplace=True)
        self.df_.dropna(subset=['Tipo'], inplace=True)

        return self

    def imputation(self):
        self.df_.fillna('', inplace=True)
        self.df_.reset_index(drop=True, inplace=True)
        for row in range(len(self.df_)):
            if self.df_.Tipo.loc[row] == 'huev' and self.df_.UnidadMedida.loc[
                    row] == '':
                self.df_['UnidadMedida'].loc[row] = 'pz'
            if self.df_.Tipo.loc[
                    row] == 'tortill' and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = 'pz'
            if self.df_.Tipo.loc[row] == 'papel' and self.df_.UnidadMedida.loc[
                    row] == '':
                self.df_['UnidadMedida'].loc[row] = 'roll'
            if self.df_.Tipo.loc[row] == 'lech' and self.df_.UnidadMedida.loc[
                    row] == '':
                self.df_['UnidadMedida'].loc[row] = 'l'
            if self.df_.Contenido.loc[row] == '':
                self.df_['Contenido'].loc[row] = '1'
            if self.df_.Marca.loc[row] == '':
                self.df_['Marca'].loc[row] = 'no_especificado'
            # Join the non-empty Tipo_2/Tipo_3/Tipo_4 values with '_',
            # falling back to 'no_especificado' when all three are empty
            tipo_parts = [
                self.df_[col].loc[row]
                for col in ['Tipo_2', 'Tipo_3', 'Tipo_4']
                if self.df_[col].loc[row] != ''
            ]
            self.df_['Tipo_2'].loc[row] = (
                '_'.join(tipo_parts) if tipo_parts else 'no_especificado')
        self.knn_imputer_for_empaque()

        return self

    def knn_imputer_for_empaque(self):
        data = self.df_.copy(deep=True)
        data['Empaque'][(data['Empaque'] == '')] = np.nan
        # initialize variables
        ordinal_enc_dict = {}
        columns_to_encode = ['Tipo', 'Tipo_2', 'Empaque']
        # loop over columns to encode
        for col_name in data[columns_to_encode]:
            # create ordinal encoder for the column
            ordinal_enc_dict[col_name] = OrdinalEncoder()
            # select the non-null values in the column
            col = data[col_name]
            col_not_null = col[col.notnull()]
            reshaped_vals = col_not_null.values.reshape(-1, 1)
            # encode the non-null values of the column
            encoded_vals = ordinal_enc_dict[col_name].fit_transform(
                reshaped_vals)
            # store the values to non-null values of the column in data
            data.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
        # imputing with KNN
        data.iloc[:,
                  [data.columns.get_loc(col_)
                   for col_ in columns_to_encode]] = np.round(
                       self.imputer_.fit_transform(data[columns_to_encode]))
        for col_name in data[columns_to_encode]:
            # reshape the data
            reshaped = data[col_name].values.reshape(-1, 1)
            # perform inverse transformation of the ordinally encoded columns
            data[col_name] = ordinal_enc_dict[col_name].inverse_transform(
                reshaped)

        self.df_ = data.copy(deep=True)

        return self

    def search_in_dict(self, data):
        for key, value in self.inv_words_dict_.items():
            for i in value:
                if i == data:
                    return key
                else:
                    pass
        return data

    def inv_words_funct(self):
        column_name = [
            'Tipo', 'Tipo_2', 'Marca', 'Empaque', 'UnidadMedida', 'Contenido'
        ]
        for element in column_name:
            self.df_[element] = self.df_.apply(
                lambda row: self.search_in_dict(row[element]), axis=1)
        return self

    def drop_unused_columns(self):
        columns_to_drop = [
            'descripcion', 'producto', 'LocalidadGeografica', 'Tipo_3',
            'Tipo_4'
        ]
        self.df_.drop(columns_to_drop, axis=1, inplace=True)

        return self
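# Every method above returns self, so the cleaning steps are meant to be
# chained (preprocess -> categorize -> append_df -> ... -> drop_unused_columns).
# The knn_imputer_for_empaque step follows a common encode -> KNN-impute ->
# decode pattern; below is a minimal standalone sketch of that pattern on a
# toy frame (column values are illustrative, and self.imputer_ is assumed to
# be a sklearn.impute.KNNImputer).
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

toy = pd.DataFrame({
    'Tipo': ['lech', 'huev', 'lech', 'tortill'],
    'Empaque': ['caja', np.nan, 'caja', np.nan],
})

encoders = {}
for col in toy.columns:
    # Ordinally encode only the non-null values of each column
    encoders[col] = OrdinalEncoder()
    not_null = toy[col].notnull()
    toy.loc[not_null, col] = encoders[col].fit_transform(
        toy.loc[not_null, col].values.reshape(-1, 1)).squeeze()

# Impute the missing category codes from the nearest neighbours and round
# them back to valid codes
imputed = np.round(KNNImputer(n_neighbors=2).fit_transform(toy))

# Decode the imputed codes back to their original labels
for i, col in enumerate(toy.columns):
    toy[col] = encoders[col].inverse_transform(imputed[:, [i]]).ravel()
print(toy)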
dir_name = './res_reddit/'
news_dir = ''
news2 = pd.read_csv(news_dir + 'cooc21weighted.csv', index_col=0)
news1 = pd.read_csv(news_dir + 'cooc11weighted.csv', index_col=0)
st = SnowballStemmer('english')

for filename in os.listdir(dir_name):
    if 'count_term_500' in filename:
        print(filename)
    else:
        continue

    term = filename.split('_child_')[1][:-4]
    term_clean = ' '.join(term.split('_'))
    term_stemmed = ' '.join([st.stem(x) for x in term.split('_')])
    red = pd.read_csv(dir_name + 'count_term_500_child_' + term + '.csv',
                      index_col=0)

    if len(term.split('_')) == 2:
        news_words = news2
    else:
        news_words = news1

    if term_stemmed + '_count_freq_weighted' not in news_words.columns:
        continue
    news_words = news_words[[term_stemmed + '_count_freq_weighted']]
    news_words.columns = ['news']
    news_words['news_rank'] = news_words['news'].rank(method='min',
                                                      ascending=False)
Exemplo n.º 43
0
import nltk
from nltk.stem import RegexpStemmer

stemmerregexp = RegexpStemmer('ing')
stemmerregexp.stem('running')

# ### II. SNOWBALL STEMMER

# In[7]:

import nltk
from nltk.stem import SnowballStemmer

SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')

# ### III. LANCASTER STEMMER

# In[8]:

import nltk
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()
lancaster.stem('running')

# ### IV. PORTER STEMMER

# In[10]:
Exemplo n.º 44
0
print(list(newsgroups_train.target_names))

# Let's look at some sample news
k = newsgroups_train.data[:2]
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)

print(WordNetLemmatizer().lemmatize('went',
                                    pos='v'))  # past tense to present tense

stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer', 'plotted'
]
singles = [stemmer.stem(plural) for plural in original_words]

pd.DataFrame(data={'original word': original_words, 'stemmed': singles})


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
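# Hypothetical usage of preprocess(); the exact stems depend on the installed
# gensim/NLTK versions, so the expected output below is only indicative.
print(preprocess("Scientists were meeting to discuss the sensational findings"))
# e.g. ['scientist', 'meet', 'discuss', 'sensat', 'find']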
Exemplo n.º 45
0
tokenized_sentences = nltk.sent_tokenize(text)

print(tokenized_sentences)

# Tokenize words
tokenized_words = [
    nltk.word_tokenize(sentence) for sentence in tokenized_sentences
]

print(tokenized_words)

# POS Tagger
tokens_pos = [pos_tagger.tag(word) for word in tokenized_words]
print(tokens_pos)

# Stems
stemmas = [stemmer.stem(word) for word in tokenized_words[0]]
print(stemmas)

# NER Tagger
entities = [ner_tagger.tag(word) for word in tokenized_words]
print(entities)

# Stop words
print(stop_words)

# Removing stop words
# for word in tokenized_words:
#     if word not in stop_words:
#         print(word)
Exemplo n.º 46
0
def stemning(words):
    stemmer = SnowballStemmer('danish')
    return [stemmer.stem(word) for word in words]
Exemplo n.º 47
0
def stemming_tweets(tweet):
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in tweet]
    return stemmed_words
Exemplo n.º 48
0
    for word in words:
        if word.encode('utf8') in dicts:
            tag = sorted(dicts[word.encode('utf8')].items(),
                         key=lambda x: x[1],
                         reverse=True)[0][0]
            tagged_text.append(word + ' ' + tag)
        else:
            tagged_text.append(word + ' ' + 'NN')

# Create frequency list
freq_list = dict()
for word in tagged_text:
    tokens = nltk.word_tokenize(word)
    if tokens[0] not in stopwords.words(
            'swedish') and tokens[1] != 'MAD' and tokens[1] != 'MID':
        if stemmer.stem(tokens[0]) in freq_list:
            freq_list[stemmer.stem(tokens[0])] += 1
        else:
            freq_list[stemmer.stem(tokens[0])] = 1

# Divide every value in the frequency list by the highest frequency so the values fall between 0 and 1
highest_value = sorted(freq_list.items(), key=lambda x: x[1],
                       reverse=True)[0][1]
for word in freq_list:
    freq_list[word] = freq_list[word] / float(highest_value)

# Run every sentence against the frequency list (stemmed words) and assign each sentence a score
ordered = []
for sentence in sentences:
    score = 0
    tokens = nltk.word_tokenize(sentence)
final_table["all_reviews_string"] = ''
snowball_stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

for i in range(len(final_table)):
    review = final_table["all_reviews_text"][i]
    #print review
    if review.__str__() != 'nan':
        str1 = ''.join(review)
        #lemmetize it
        #wordnet_lemmatizer.lemmatize
        documents = " ".join(
            [wordnet_lemmatizer.lemmatize(word) for word in str1.split(" ")])
        #stem it
        documents = " ".join(
            [snowball_stemmer.stem(word) for word in documents.split(" ")])
        print(documents)
        final_table["all_reviews_string"][i] = documents

stop = stopwords.words('english')
stop.remove(u'no')
stop.remove(u'nor')
stop.remove(u'not')
stop.append(u'let')
stop.append(u'anyway')
stop.append(u'else')
stop.append(u'maybe')
stop.append(u'however')
stop.append(u'00')
stop.append(u'10')
stop.append(u'11')
Exemplo n.º 50
0
    
    for file in files:
        file_path = count_to_file[file]
        file_id = file_to_count[file_path]
    
        lines = ""
        
        f = open(file_path, 'r')

        local_freq_dic = {} #store only the freq of words appearing in one file

        lines = f.read()

        words = tokenizer.tokenize(lines) #tokenize
        words = [word.strip('_ ').lower() for word in words if word not in STOPWORDS] #lowercase 
        words = [stemmer.stem(word) for word in words] #stemmer
        words = [word for word in words if word not in STOPWORDS and len(word) > 0] #stopword removal

        for word in words:
            all_words[cls].add(word)
            
        for word in words:
            if word not in local_freq_dic:
                local_freq_dic[word] = 0 #initialize
            local_freq_dic[word] += 1

        for word in local_freq_dic:
            if word not in train_tfidf_dic:
                train_tfidf_dic[word] = {} #initialize
            train_tfidf_dic[word][file_id] = local_freq_dic[word]
Exemplo n.º 51
0
json_data = None
with open(
        'C:/Users/ABHISEK/Downloads/yelp_academic_dataset_review.json/yelp_academic_dataset_review.json'
) as data_file:
    lines = data_file.readlines()
    joined_lines = "[" + ",".join(lines) + "]"

    json_data = j.loads(joined_lines)

data = pd.DataFrame(json_data)

stemmer = SnowballStemmer('english')
words = stopwords.words("english")

data['cleaned'] = data['text'].apply(lambda x: " ".join([
    stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split()
    if i not in words
]).lower())

X_train, X_test, y_train, y_test = train_test_split(data['cleaned'],
                                                    data.stars,
                                                    test_size=0.2)

pipeline = Pipeline([('vect',
                      TfidfVectorizer(ngram_range=(1, 2),
                                      stop_words="english",
                                      sublinear_tf=True)),
                     ('chi', SelectKBest(chi2, k=10000)),
                     ('clf',
                      LinearSVC(C=1.0, penalty='l1', max_iter=3000,
                                dual=False))])
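# The pipeline above is only defined; fitting and scoring it is the usual
# scikit-learn two-liner (nothing here beyond the standard API).
model = pipeline.fit(X_train, y_train)
print('test accuracy:', model.score(X_test, y_test))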
Exemplo n.º 52
0
def jamesLemmatize(tokens, doStemDic):
    '''
    This method is used to lemmatize and stem text for topic modeling
    It is used by preProcess and preProcessSentence above

    Parameters
    ----------
            tokens: list or str
                    the input to be lemmatized and stemmed
                    either a string that has not yet been tokenized, or a list representing an
                    already tokenized string

            doStemDic: bool
                    a setting for whether or not a word stem to word dictionary should be
                    constructed during lemmatization and returned with the results

    Output
    ------
            dict
                    a dictionary containing the results of lemmatization
                    if doStemDic is true, the dictionary will have two keys:
                    "lemmatized" which has a list representing the lemmatized input as a value,
                    and "stemDic" which has the word stem to word dictionary as a value
                    otherwise, the dictionary will only have the "lemmatized" key and its value
    '''
    # Initialize the objects to be returned, if needed
    lemmatized = []
    if doStemDic:
        stemDic = {}
    # Initialize a WordNetLemmatizer, imported from nltk.stem
    lemmatizer = WordNetLemmatizer()
    # Initialize a SnowballStemmer in english, imported from nltk.stem
    stemmer = SnowballStemmer('english')
    # Strip apostrophes and any trailing letters (e.g. "it's" -> "it") before lemmatizing
    if type(tokens) == str:
        tokens = re.sub("\'[a-zA-Z]*", '', tokens)
        # Tokenize the text using simple_preprocess, imported from gensim.utils
        tokens = simple_preprocess(tokens)
    else:
        tokens = [re.sub("\'[a-zA-Z]*", '', token) for token in tokens]
    # Tag each word using pos_tag, imported from nltk.tag, and iterate through each token and tag
    for token, tag in pos_tag(tokens):
        # Format it to lowercase
        token = token.lower()
        # Check whether the token is tagged as a noun, a verb, or other, and set pos appropriately
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # Filter out each token that is punctuation, in STOPWORDS (imported from
        #   gensim.parsing.preprocessing), or is shorter than the minimum acceptable token length
        if token not in STOPWORDS and token not in string.punctuation and len(token) >= cfg['mintokenlen']:
            # Lemmatize the token using WordNetLemmatizer
            lemma = lemmatizer.lemmatize(token, pos)
            # Stem the token using the SnowballStemmer
            lemma = stemmer.stem(lemma)
            # If a stem dictionary is needed, add the stem as a key mapping to the
            #   original token that produced it (first occurrence wins)
            if doStemDic:
                if lemma not in stemDic:
                    stemDic[lemma] = token
            # Add the lemma to the lemmatized list
            lemmatized.append(lemma)

    # If a stem dictionary is required, return a dictionary containing the lemmatized list and the stem dictionary
    if doStemDic:
        return {"lemmatized": lemmatized, "stemDic": stemDic}
    # Otherwise, return a dictionary with only the lemmatized list
    return {"lemmatized": lemmatized}
Exemplo n.º 53
0
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
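# Example call; stop_words (and punctuation) are module-level names referenced
# by text_to_wordlist but not shown in this excerpt.
print(text_to_wordlist("What's the best way to learn programing?",
                       remove_stop_words=True, stem_words=True))
# e.g. "best way learn program"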
Exemplo n.º 54
0
class Processing:
    def __init__(self,
                 stopwords_path='data/',
                 tokenizer_path='models/',
                 max_len=80):
        # It needs a stopwords file to init
        stop_words = pd.read_csv(stopwords_path + 'stopwords-es.txt',
                                 header=None)
        stop_words = stop_words[0].tolist() + ['secuela']
        self.stop_words = stop_words
        self.n_words = 8000
        self.max_len = max_len
        # self.aug = naf.Sequential([
        #    naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', action="insert", aug_p=0.1),
        #    naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', action="substitute", aug_p=0.9),
        #    naw.RandomWordAug(action="delete", aug_p=0.1)
        # ])

        try:
            self.stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
        except:
            nltk.download("popular")
            self.stemmer = SnowballStemmer("spanish", ignore_stopwords=True)

        # loading
        with open(tokenizer_path + 'tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.__vocab_size = len(self.tokenizer.word_index) + 1

    @property
    def vocab_size(self):
        return self.__vocab_size

    def normalize(self, s):
        s = s.lower()
        replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"),
                        ("ú", "u"), ("ñ", "n"))
        for a, b in replacements:
            s = s.replace(a, b).replace(a.upper(), b.upper())

        return s

    def split_punt(self, x):
        words = WordPunctTokenizer().tokenize(x)
        x = str(' '.join(words))
        x = re.sub(' +', ' ', x)

        return x

    def delete_stop_words(self, x):
        x = x.translate(str.maketrans('', '', string.punctuation))
        x = x.translate(str.maketrans('', '', '1234567890ªº¡¿'))
        words = x.split(' ')
        words = [word for word in words if word not in self.stop_words]
        x = str(' '.join(words))

        return x

    def stem_sentence(self, sentence):
        # Stem the sentence
        stemmed_text = [
            self.stemmer.stem(word) for word in word_tokenize(sentence)
        ]

        return " ".join(stemmed_text)

    def augment(self, x):
        try:
            return self.aug.augment(x)
        except:
            return None

    def clean_overview(self, df):
        # Execute the full cleaning process into every overview
        df['overview'] = df['overview'].apply(lambda x: self.normalize(x))
        df['overview'] = df['overview'].apply(
            lambda x: self.delete_stop_words(x))
        df['overview'] = df['overview'].apply(lambda x: self.stem_sentence(x))
        df['overview'] = df.apply(
            lambda x: self.get_actors(x['cast']) + ' ' + x['overview'], axis=1)
        df['overview'] = df.apply(
            lambda x: self.get_director(x['crew']) + x['overview'], axis=1)
        df['overview'] = df['overview'].apply(lambda x: self.normalize(x))
        df['overview'] = df['overview'].apply(
            lambda x: self.delete_stop_words(x))

        return df

    # Get staff and paste to overview
    @staticmethod
    def eval_cell(cell):

        try:

            cell_array = eval(cell)

        except:

            cell_array = []

        return cell_array

    def get_actors(self, cast):

        eval_cast = self.eval_cell(cast)

        if len(eval_cast) > 2:
            up = 3
        else:
            up = len(eval_cast)

        actors = ''

        for i in range(0, up):
            actor = eval_cast[i]['name']
            actor = self.normalize(actor.replace(' ', '_').lower())

            actors = actors + ' ' + actor

        return actors

    def get_director(self, crew):

        eval_crew = self.eval_cell(crew)

        directors = [
            member['name'] for member in eval_crew
            if member['job'] == 'Director'
        ]
        directors = [
            self.normalize(director.replace(' ', '_').lower())
            for director in directors
        ]
        directors = str(' '.join(directors))

        return directors

    def paste_cast(self, data):

        data['overview'] = data.apply(
            lambda x: self.get_actors(x['cast']) + ' ' + x['overview'], axis=1)
        data['overview'] = data.apply(
            lambda x: self.get_director(x['crew']) + x['overview'], axis=1)

        return data

    # Split train_test
    def split_data(self, data):

        overviews = data['overview'].values
        y = data['like'].values

        overviews_train, overviews_test, y_train, y_test = train_test_split(
            overviews, y, test_size=0.15, stratify=y, random_state=9)

        return overviews_train, overviews_test, y_train, y_test

    def fit_tokenizer(self, overviews_train, num_words):
        self.tokenizer = Tokenizer(num_words)
        self.tokenizer.fit_on_texts(overviews_train)
        # Adding 1 because of reserved 0 index
        self.__vocab_size = len(self.tokenizer.word_index) + 1  # vocab_size is a read-only property

    def tokenize_overview(self, overviews, max_len):

        X = self.tokenizer.texts_to_sequences(overviews)
        # print(len(max(X, key=len)))
        from keras.preprocessing.sequence import pad_sequences

        # We pad the sentence for the left to fit with max_len
        X = pad_sequences(X, padding='pre', maxlen=max_len)
        # print(X[1])

        return X

    def process(self, data, train_dev):

        df = self.clean_overview(data)
        df = self.paste_cast(df)

        if train_dev:

            X_train, X_test, y_train, y_test = self.split_data(df)

            self.fit_tokenizer(X_train, self.n_words)
            X_train = self.tokenize_overview(X_train, self.max_len)
            X_test = self.tokenize_overview(X_test, self.max_len)

            return X_train, X_test

        else:

            X = df['overview'].values
            X = self.tokenize_overview(X, self.max_len)

            return X
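# Hypothetical driver for the Processing class; the CSV path is an assumption,
# while the expected columns ('overview', 'cast', 'crew', 'like') follow from
# the methods above.
import pandas as pd

proc = Processing(stopwords_path='data/', tokenizer_path='models/', max_len=80)
movies = pd.read_csv('data/movies_labeled.csv')  # assumed file

# train_dev=True cleans, splits, fits the tokenizer and returns padded sequences
X_train, X_test = proc.process(movies, train_dev=True)

# train_dev=False cleans and tokenizes new rows with the already-fitted tokenizer
X_new = proc.process(movies.head(5), train_dev=False)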
Exemplo n.º 55
0
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Importing the dataset
col_names = ['Rating', 'Review']
dataset = pd.read_csv('data/train.csv', names=col_names, header=None)
del col_names

#======================================================================================================
# Cleaning the texts

corpus = []
for i in range(len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = SnowballStemmer('english')
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)
del review, i

#======================================================================================================

X = corpus
y = dataset.iloc[0:len(corpus), 0].values
#X_train, X_val, y_train, y_val = train_test_split(corpus, y, test_size = 0.10, random_state = 0)


def decrement(list):
    return [x - 1 for x in list]
def stemTokenize_1(text):
    stemmer = SnowballStemmer('danish')
    return [stemmer.stem(w) for w in word_tokenize(text)]
Exemplo n.º 58
0
def stem_tokens(t_tokens, lang):
    stemmer = SnowballStemmer(language=lang)
    t_tokens = [stemmer.stem(item) for item in t_tokens]
    return t_tokens
Exemplo n.º 59
0
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer

stemmer = SnowballStemmer("english")
stemmer = EnglishStemmer()

TEXT = "Once upon a time there lived in a certain village a little country girl, " + \
       "the prettiest creature who was ever seen. Her mother was excessively fond of her; " + \
       "and her grandmother doted on her still more. This good woman had a little red riding hood made for her. " + \
       "It suited the girl so extremely well that everybody called her Little Red Riding Hood."

stemmed_text = [stemmer.stem(word) for word in TEXT.split()]
print(stemmed_text)
Exemplo n.º 60
0
        if subject not in documents:
            documents[subject] = 1
        else:
            documents[subject] += 1

        if subject not in totals:
            totals[subject] = 0

        if subject not in texts:
            texts[subject] = {}

        for word in words[1:]:
            # Only include words greater than 2 letters, after being stemmed
            if len(word) > 2:
                word = stemmer.stem(word)
                totals[subject] += 1
                vocabulary.add(word)
                if word not in texts[subject]:
                    texts[subject][word] = 1
                else:
                    texts[subject][word] += 1

    # Also used in Bayes equation
    vocabularySize = len(vocabulary)
    print('%i distinct words found in the training data' % vocabularySize)

    print('Opening test data file')
    with open(sys.argv[2]) as f:
        print('Reading test data file')
        testlines = f.readlines()
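# The excerpt stops before the classification step, but documents, totals,
# texts and vocabularySize are exactly what a Laplace-smoothed Naive Bayes
# score needs. A minimal sketch of that scoring (the helper name and the use
# of log-probabilities are assumptions, not taken from the original):
import math


def classify(words, documents, totals, texts, vocabularySize):
    """Hypothetical scorer: return the subject maximizing
    log P(subject) + sum(log P(word | subject)) with add-one smoothing."""
    total_docs = sum(documents.values())
    best_subject, best_score = None, float('-inf')
    for subject in documents:
        score = math.log(documents[subject] / total_docs)  # prior
        for word in words:
            count = texts[subject].get(word, 0)
            score += math.log((count + 1) / (totals[subject] + vocabularySize))
        if score > best_score:
            best_subject, best_score = subject, score
    return best_subject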