Exemplo n.º 1
0
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Open ``_file``, tokenize its contents and add the resulting
        document to the corpus under class ``_class``.

        @param _file: path of the text file to load (also used as docid)
        @param _class: class label the document is inserted under
        @param rem_stopwords: when true, drop tokens found in the
            language-specific stopword list
        @param stem: when true, reduce tokens to their stems (RSLP for
            'Pt', Porter for 'En')
        """
        # Read the whole file and split it on whitespace.
        with open(_file, 'r') as in_file:
            text = in_file.read().split()

        ##############Stopword removal############################
        # Stopword lists live in $TCLASS/tclass/corpus/stopwords<Lang>.
        # A set gives O(1) membership tests in the filter loop below.
        stopword_path = os.path.join(os.environ['TCLASS'], "tclass",
                                     "corpus", "stopwords" + self._lang)
        with open(stopword_path, 'rt') as f:
            stopwords = set(f.read().split())
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################

        # Fine-grained tokenization of every whitespace-delimited word.
        tok = Tokenizer()
        tokens = []
        for word in text:
            tokens.extend(tok.fineTokenization(word))

        # Case-insensitive term-frequency count.
        token_dict = {}
        for token in tokens:
            key = token.lower()
            token_dict[key] = token_dict.get(key, 0) + 1

        # document is a CorpusDocument object. The docid is the path to
        # the file (_file).
        document = CorpusDocument(_file)
        for key, value in token_dict.items():
            # Stopwords are checked before stemming, as in the original
            # pipeline ordering.
            if rem_stopwords and key in stopwords:
                continue
            if stemmer is not None:
                key = stemmer.stem(key)
            # The stemmer may return None for untreatable tokens.
            if key is not None:
                document[self._add_word_to_lex(key)] = value

        # insert_document returns a falsy value for duplicate documents.
        if self.insert_document(document, _class):
            self._file_number += 1
        else:
            self._repeated_files += 1
Exemplo n.º 2
0
    def load(self, _file, rem_stopwords=True, stem=True, merge=True,
             class_name=None):
        """
        Abstract method implementation for the txt format.

        @param _file: path of the text file to load (also used as docid)
        @param rem_stopwords: when true, drop tokens found in the
            language-specific stopword list
        @param stem: when true, reduce tokens to their stems (RSLP for
            'Pt', Porter for 'En')
        @param merge: when false, the corpus is cleared before the new
            document is inserted
        @param class_name: class label the document is inserted under
        """
        # Read the whole file and split it on whitespace.
        with open(_file, 'r') as in_file:
            text = in_file.read().split()

        ##############Stopword removal############################
        # Stopword lists live in $TCLASS/tclass/corpus/stopwords<Lang>.
        # A set gives O(1) membership tests in the filter loop below.
        stopword_path = os.path.join(os.environ['TCLASS'], "tclass",
                                     "corpus", "stopwords" + self._lang)
        with open(stopword_path, 'rt') as f:
            stopwords = set(f.read().split())
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################

        # Fine-grained tokenization of every whitespace-delimited word.
        tok = Tokenizer()
        tokens = []
        for word in text:
            tokens.extend(tok.fineTokenization(word))

        # Case-insensitive term-frequency count.
        token_dict = {}
        for token in tokens:
            key = token.lower()
            token_dict[key] = token_dict.get(key, 0) + 1

        # document is a CorpusDocument object; the docid is the file path.
        document = CorpusDocument(_file)
        for key, value in token_dict.items():
            # Stopwords are checked before stemming, as in the original
            # pipeline ordering.
            if rem_stopwords and key in stopwords:
                continue
            if stemmer is not None:
                key = stemmer.stem(key)
            # The stemmer may return None for untreatable tokens.
            if key is not None:
                document[self._add_word_to_lex(key)] = value

        # Both branches inserted the document; only the clear() differs.
        if not merge:
            self.clear()
        self.insert_document(document, class_name)
Exemplo n.º 3
0
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Implementation of method that opens an XML file, tokenizes the
        "Resenha" element of each record and adds each record to the
        corpus as a separate document.

        @param _file: file to be loaded
        @param _class: class of the file
        @param rem_stopwords: when true, drop tokens found in the
            language-specific stopword list
        @param stem: when true, reduce tokens to their stems (RSLP for
            'Pt', Porter for 'En')
        """
        # initialization: DOM-parse the file; records are children of the
        # document root.
        dom = parse(_file)
        filhos = dom.childNodes[0].childNodes

        ##############Stopword removal############################
        # Stopword lists live in $TCLASS/tclass/corpus/stopwords<Lang>.
        # A set gives O(1) membership tests in the filter loop below.
        stopword_path = os.path.join(os.environ['TCLASS'], "tclass",
                                     "corpus", "stopwords" + self._lang)
        with open(stopword_path, 'rt') as f:
            stopwords = set(f.read().split())
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################

        # Tokenizer is loop-invariant: build it once.
        tok = Tokenizer()

        # The relevant record nodes start at index 3 and occur every
        # second child node (text nodes interleave element nodes).
        for i in range(3, len(filhos) - 1, 2):
            body_data = filhos[i].getElementsByTagName("Resenha")[0].childNodes[0].data
            tokens = []
            for word in body_data.split():
                tokens.extend(tok.fineTokenization(word))

            # Case-insensitive term-frequency count for this record.
            token_dict = {}
            for token in tokens:
                key = token.lower()
                token_dict[key] = token_dict.get(key, 0) + 1

            # document is a CorpusDocument object. The docid is the
            # record's 'id' attribute.
            document = CorpusDocument(filhos[i].getAttribute('id'))
            for key, value in token_dict.items():
                # Stopwords are checked before stemming, as in the
                # original pipeline ordering.
                if rem_stopwords and key in stopwords:
                    continue
                if stemmer is not None:
                    key = stemmer.stem(key)
                # The stemmer may return None for untreatable tokens.
                if key is not None:
                    document[self._add_word_to_lex(key)] = value

            # insert_document returns a falsy value for duplicates.
            if self.insert_document(document, _class):
                self._file_number += 1
            else:
                self._repeated_files += 1
Exemplo n.º 4
0
    def _load_file(self, _file, _class, rem_stopwords, stem):
        """
        Implementation of method that SAX-parses a Reuters file,
        tokenizes each extracted BODY element and adds each one to the
        corpus as a separate document.

        @param _file: file to be loaded
        @param _class: class of the file
        @param rem_stopwords: when true, drop tokens found in the
            language-specific stopword list
        @param stem: when true, reduce tokens to their stems (RSLP for
            'Pt', Porter for 'En')
        """
        # initialization: collect every BODY element's text via SAX.
        handlerbody = ReutersHandler("BODY")
        parserbody = make_parser()
        parserbody.setContentHandler(handlerbody)
        parserbody.parse(_file)

        ##############Stopword removal############################
        # Stopword lists live in $TCLASS/tclass/corpus/stopwords<Lang>.
        # A set gives O(1) membership tests in the filter loop below.
        stopword_path = os.path.join(os.environ['TCLASS'], "tclass",
                                     "corpus", "stopwords" + self._lang)
        with open(stopword_path, 'rt') as f:
            stopwords = set(f.read().split())
        ##########################################################

        #################Stem setup###############################
        stemmer = None
        if stem:
            if self._lang == 'Pt':
                stemmer = RSLP()
            elif self._lang == 'En':
                stemmer = PorterStemmer()
        ##########################################################

        # Tokenizer is loop-invariant: build it once.
        tok = Tokenizer()

        # One corpus document per extracted BODY element.
        for label in handlerbody.LABEL:
            body_data = str(label)
            tokens = []
            for word in body_data.split():
                tokens.extend(tok.fineTokenization(word))

            # Case-insensitive term-frequency count for this body.
            token_dict = {}
            for token in tokens:
                key = token.lower()
                token_dict[key] = token_dict.get(key, 0) + 1

            # document is a CorpusDocument object. The docid is the path
            # to the file (_file).
            document = CorpusDocument(_file)
            for key, value in token_dict.items():
                # Stopwords are checked before stemming, as in the
                # original pipeline ordering.
                if rem_stopwords and key in stopwords:
                    continue
                if stemmer is not None:
                    key = stemmer.stem(key)
                # The stemmer may return None for untreatable tokens.
                if key is not None:
                    document[self._add_word_to_lex(key)] = value

            # insert_document returns a falsy value for duplicates.
            if self.insert_document(document, _class):
                self._file_number += 1
            else:
                self._repeated_files += 1