def _load_file(self, _file, _class, rem_stopwords, stem):
    """
    Implementation of method that opens a file, tokenizes it and adds it
    to the corpus.

    @param _file: file to be loaded
    @param _class: class of the file
    """
    in_file = open(_file, 'r')
    text = in_file.read()
    in_file.close()
    text = text.split()
    tokens = []
    tok = Tokenizer()

    ############## Stopword removal ##########################
    stopwordFile = "stopwords" + self._lang
    f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus",
                          stopwordFile), 'rt')
    stopwords = f.read()
    f.close()
    stopwords = stopwords.split()
    ##########################################################

    ################# Stem setup #############################
    stemmer = None
    if stem:
        if self._lang == 'Pt':
            stemmer = RSLP()
        elif self._lang == 'En':
            stemmer = PorterStemmer()
    ##########################################################

    for word in text:
        tokens.extend(tok.fineTokenization(word))

    # Build a term-frequency map over the lowercased tokens.
    token_dict = {}
    for token in tokens:
        try:
            token_dict[token.lower()] += 1
        except KeyError:
            token_dict[token.lower()] = 1

    # document is a CorpusDocument object; the docid is the path to the
    # file (_file).
    document = CorpusDocument(_file)
    for key, value in token_dict.iteritems():
        if not (rem_stopwords and key in stopwords):
            if stemmer is not None:
                key = stemmer.stem(key)
            if key is not None:
                document[self._add_word_to_lex(key)] = value

    if self.insert_document(document, _class):
        self._file_number += 1
    else:
        self._repeated_files += 1
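# The try/except above is a plain dict-based term-frequency count. A tiny
# standalone sketch of the same pattern, with no tclass dependencies:
#
#     freqs = {}
#     for token in ['The', 'the', 'cat']:
#         try:
#             freqs[token.lower()] += 1
#         except KeyError:
#             freqs[token.lower()] = 1
#     # freqs == {'the': 2, 'cat': 1}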
def load(self, _file, rem_stopwords=True, stem=True, merge=True,
         class_name=None):
    """
    Abstract method implementation for the txt format.
    """
    in_file = open(_file, 'r')
    text = in_file.read()
    in_file.close()
    text = text.split()
    tokens = []
    tok = Tokenizer()

    ############## Stopword removal ##########################
    stopwordFile = "stopwords" + self._lang
    f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus",
                          stopwordFile), 'rt')
    stopwords = f.read()
    f.close()
    stopwords = stopwords.split()
    ##########################################################

    ################# Stem setup #############################
    stemmer = None
    if stem:
        if self._lang == 'Pt':
            stemmer = RSLP()
        elif self._lang == 'En':
            stemmer = PorterStemmer()
    ##########################################################

    for word in text:
        tokens.extend(tok.fineTokenization(word))

    # Build a term-frequency map over the lowercased tokens.
    token_dict = {}
    for token in tokens:
        try:
            token_dict[token.lower()] += 1
        except KeyError:
            token_dict[token.lower()] = 1

    document = CorpusDocument(_file)
    for key, value in token_dict.iteritems():
        if not (rem_stopwords and key in stopwords):
            if stemmer is not None:
                key = stemmer.stem(key)
            if key is not None:
                document[self._add_word_to_lex(key)] = value

    # When merge is False the current corpus is cleared first, so the new
    # document replaces it; otherwise the document is merged in.
    if not merge:
        self.clear()
    self.insert_document(document, class_name)
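# A minimal usage sketch. The class that owns load() is not shown in this
# file, so the class name, constructor and import path below are
# assumptions, not the actual tclass API:
#
#     from tclass.corpus import TxtCorpus   # hypothetical import path
#
#     corpus = TxtCorpus()                  # hypothetical constructor
#     corpus.load('review.txt', rem_stopwords=True, stem=True,
#                 merge=True, class_name='positive')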
def _load_file(self, _file, _class, rem_stopwords, stem):
    """
    Implementation of method that opens a file, tokenizes it and adds it
    to the corpus.

    @param _file: file to be loaded
    @param _class: class of the file
    """
    # Initialization: parse the XML file; filhos ("children" in
    # Portuguese) holds the child nodes of the root element.
    dom = parse(_file)
    filhos = dom.childNodes[0].childNodes

    ############## Stopword removal ##########################
    stopwordFile = "stopwords" + self._lang
    f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus",
                          stopwordFile), 'rt')
    stopwords = f.read()
    f.close()
    stopwords = stopwords.split()
    ##########################################################

    ################# Stem setup #############################
    stemmer = None
    if stem:
        if self._lang == 'Pt':
            stemmer = RSLP()
        elif self._lang == 'En':
            stemmer = PorterStemmer()
    ##########################################################

    # childNodes interleaves whitespace text nodes with element nodes, so
    # the walk starts at index 3 and steps by 2, visiting one review
    # element per iteration.
    body_data = ""
    i = 3
    while i < (len(filhos) - 1):
        review = filhos[i].getElementsByTagName("Resenha")[0]
        body_data = review.childNodes[0].data
        text = body_data.split()
        tokens = []
        tok = Tokenizer()
        for word in text:
            tokens.extend(tok.fineTokenization(word))

        # Build a term-frequency map over the lowercased tokens.
        token_dict = {}
        for token in tokens:
            try:
                token_dict[token.lower()] += 1
            except KeyError:
                token_dict[token.lower()] = 1

        # document is a CorpusDocument object; here the docid is the id
        # attribute of the review element, not the path to the file.
        document = CorpusDocument(filhos[i].getAttribute('id'))
        for key, value in token_dict.iteritems():
            if not (rem_stopwords and key in stopwords):
                if stemmer is not None:
                    key = stemmer.stem(key)
                if key is not None:
                    document[self._add_word_to_lex(key)] = value

        if self.insert_document(document, _class):
            self._file_number += 1
        else:
            self._repeated_files += 1

        body_data = ""
        i += 2
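# Inferred input layout (an assumption drawn from the DOM walk above, not
# from a schema): whitespace text nodes sit between the element nodes, and
# starting at index 3 skips the element at index 1, presumably a header.
# All element names other than "Resenha" are hypothetical:
#
#     <Corpus>
#       <Header>...</Header>
#       <Review id="doc-0001">
#         <Resenha>free text of the first review ...</Resenha>
#       </Review>
#       <Review id="doc-0002">
#         <Resenha>...</Resenha>
#       </Review>
#     </Corpus>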
def _load_file(self, _file, _class, rem_stopwords, stem):
    """
    Implementation of method that opens a file, tokenizes it and adds it
    to the corpus.

    @param _file: file to be loaded
    @param _class: class of the file
    """
    # Initialization: parse the file with a SAX handler; judging by its
    # use below, ReutersHandler collects the text of each BODY element
    # into handlerbody.LABEL.
    handlerbody = ReutersHandler("BODY")
    parserbody = make_parser()
    parserbody.setContentHandler(handlerbody)
    parserbody.parse(_file)

    ############## Stopword removal ##########################
    stopwordFile = "stopwords" + self._lang
    f = open(os.path.join(os.environ['TCLASS'], "tclass", "corpus",
                          stopwordFile), 'rt')
    stopwords = f.read()
    f.close()
    stopwords = stopwords.split()
    ##########################################################

    ################# Stem setup #############################
    stemmer = None
    if stem:
        if self._lang == 'Pt':
            stemmer = RSLP()
        elif self._lang == 'En':
            stemmer = PorterStemmer()
    ##########################################################

    body_data = ""
    for i in range(len(handlerbody.LABEL)):
        body_data = str(handlerbody.LABEL[i])
        text = body_data.split()
        tokens = []
        tok = Tokenizer()
        for word in text:
            tokens.extend(tok.fineTokenization(word))

        # Build a term-frequency map over the lowercased tokens.
        token_dict = {}
        for token in tokens:
            try:
                token_dict[token.lower()] += 1
            except KeyError:
                token_dict[token.lower()] = 1

        # document is a CorpusDocument object; the docid is the path to
        # the file (_file).
        document = CorpusDocument(_file)
        for key, value in token_dict.iteritems():
            if not (rem_stopwords and key in stopwords):
                if stemmer is not None:
                    key = stemmer.stem(key)
                if key is not None:
                    document[self._add_word_to_lex(key)] = value

        if self.insert_document(document, _class):
            self._file_number += 1
        else:
            self._repeated_files += 1

        body_data = ""
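# ReutersHandler is defined elsewhere. Judging by its use above, it gathers
# the character data of every element with the given tag name into a LABEL
# list. A minimal sketch under that assumption (not the project's actual
# implementation):
#
#     from xml.sax.handler import ContentHandler
#
#     class ReutersHandler(ContentHandler):
#         def __init__(self, tag):
#             ContentHandler.__init__(self)
#             self.tag = tag
#             self.LABEL = []
#             self._inside = False
#             self._buffer = []
#
#         def startElement(self, name, attrs):
#             if name == self.tag:
#                 self._inside = True
#                 self._buffer = []
#
#         def characters(self, content):
#             if self._inside:
#                 self._buffer.append(content)
#
#         def endElement(self, name):
#             if name == self.tag:
#                 self._inside = False
#                 self.LABEL.append(''.join(self._buffer))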