# -*- coding: utf-8 -*-
#############################################################
#
# TUM Informatics
# Master Thesis Project: Indexing Methods for Social Search
#
# Author: Oriana Baldizan
# Date: October 2015
#
# wiki_preprocessor.py: Defines the pre-processing methods
# for the Wikipedia data
#
# - Removes stop words
# - Stems the remaining words
# - Removes very common and very uncommon words
#
#############################################################
# Python Modules
from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from gensim import corpora, utils
import logging
import unicodedata
# Global Variables
stop_words = stopwords.words("english")
wnl = WordNetLemmatizer() # lemmatize(word)
stl = LancasterStemmer() # stem(word)
stp = PorterStemmer()
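
# Note: only the Porter stemmer (stp) is used below; the lemmatizer (wnl) and
# the Lancaster stemmer (stl) are kept available for experimenting with other
# normalization strategies.
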
class WikiDocument(object):
""" Represents a Wikipedia article """
def __init__(self, raw_data, not_raw=False):
self.document_id = None
self.content = ""
self.title = ""
self.url = ""
if not_raw:
self._create_basic_document(raw_data[0], raw_data[1])
else:
self._create_document(raw_data)
    def _create_basic_document(self, doc_id, content):
        self.document_id = doc_id
        self.content = content
def _create_document(self, raw_data):
for element in raw_data:
if element.startswith("<doc"):
# <doc id="46571441" url="https://en.wikipedia.org/wiki?curid=46571441" title="The bacterial, archaeal and plant plastid code">
# Eliminar <doc, > y "
element = element.replace("<doc ", "")
element = element.replace("\"", "")
element = element.replace(">", "")
                # Extract document id (element[3:] skips the leading "id=")
                end = element.find(" url=")
                self.document_id = element[3:end]

                # Extract document url: it ends where " title=" begins
                start = element.find("url=") + 4
                end = element.find(" title=")
                self.url = element[start:end]
# Extract document title
start = element.find("title=") + 6
self.title = element[start:]
elif element != "": # Ignore empty lines
self.content += " " + element


class WikiPreprocessor(object):
""" Preprocessor for Wikipedia data """
def __init__(self, setting):
self.setting = setting
self.corpus = []
        self.vocabulary = corpora.Dictionary()  # token -> id mapping
def add_documents(self, documents):
""" Add list of documents to vocabulary """
self.vocabulary.add_documents(documents)
    def clean_documents(self, raw_documents):
        """ Cleans a list of raw documents in parallel """
        logging.info("Distributing work ...")

        # NOTE: bound methods only pickle on Python 3; under Python 2 a
        # module-level worker function would be needed for the pool
        p_pool = Pool()
        # imap is lazy, so materialize the results before closing the pool
        clean_documents = list(p_pool.imap(self.create_clean_document, raw_documents))
        p_pool.close()
        p_pool.join()

        return clean_documents
def create_clean_document(self, document):
#logging.info("\nCleaning document ...")
clean_document = document
clean_document.content = self.remove_special_characters(clean_document.content)
# Convert document into tokens and clean it
tokens = utils.simple_preprocess(clean_document.content)
clean_content = self.remove_stop_words(tokens)
clean_document.content = ' '.join(clean_content)
return clean_document
def remove_special_characters(self, text):
""" Replaces the accents and special characters in text """
        nfkd_form = unicodedata.normalize('NFKD', unicode(text)).encode('ASCII', 'ignore')
        return nfkd_form
def create_clean_corpus(self, raw_data):
logging.info("\nCleaning data ...")
for document in raw_data:
# Convert document into tokens
self.corpus.append( utils.simple_preprocess(document) )
logging.info("\nRemoving short stop words ...")
for (index, document) in enumerate(self.corpus):
self.corpus[index] = self.remove_stop_words(document)
logging.info("\nRetrieving vocabulary ...")
self.vocabulary = corpora.Dictionary(self.corpus)
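        # self.vocabulary now maps each surviving stem to an integer id,
        # e.g. {"anarch": 0, "polit": 1, ...} (illustrative)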
@staticmethod
def remove_stop_words(tokens):
""" Removes stopwords and stems the rest of the words """
result = []
for token in tokens:
if token not in stop_words:
stem_token = stp.stem(token)
if len(stem_token) > 2:
result.append(stem_token)
return result
    @staticmethod
    def get_dictionary_from_corpora(corpus_list):
        """ Builds a single gensim dictionary from several tokenized corpora """
        combined_corpora = []
        for corpus in corpus_list:
            combined_corpora += corpus
        return corpora.Dictionary(combined_corpora)
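

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the thesis pipeline):
# the sample lines below are hypothetical and only mimic the "<doc ...>"
# header format this module expects from the Wikipedia extractor output.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_lines = [
        '<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">',
        "Anarchism is a political philosophy that advocates self-governed societies.",
        ""
    ]

    document = WikiDocument(sample_lines)
    preprocessor = WikiPreprocessor(setting=None)
    clean = preprocessor.create_clean_document(document)

    print(clean.document_id + " - " + clean.title)  # 12 - Anarchism
    print(clean.content)  # e.g. "anarch polit philosophi advoc self govern societi"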