示例#1
0
    def preprocess_all(self, raw_files):
        """
        Turn the raw files into documents and build their bag of words.

        The module-level ``preprocess_all`` function produces the
        documents, which are stored on ``self.documents``; the result of
        ``bag_of_documents`` over them is stored on ``self.docs_bag``.
        Nothing is returned.

        Args:
            raw_files: dictionary mapping a document key to the
                document content.
        """
        logger.info("Preprocessing...")

        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        documents = preprocess_all(raw_files)
        self.documents = documents

        docs_bag = bag_of_documents(self.documents)
        self.docs_bag = docs_bag
示例#2
0
    def preprocess_all(self, raw_files):
        """
        Turn the raw files into documents and index their terms.

        Steps:
            1. build the documents (stored on ``self.documents``)
            2. record how many documents there are (``self.docs_no``)
            3. build ``self.docs_bag``: for each term, the documents in
               which it exists and how many times it occurs

        Nothing is returned; results are kept on the instance.

        Args:
            raw_files: dict (key = document key, value = document text)
        """
        logger.info("Preprocessing...")

        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        documents = preprocess_all(raw_files)
        self.documents = documents

        doc_count = len(self.documents)
        self.docs_no = doc_count

        docs_bag = bag_of_documents(self.documents)
        self.docs_bag = docs_bag
示例#3
0
    def preprocess_all(self, raw_files):
        """
        Build the document collection and its term index from raw files.

        Steps:
            1. create the documents and keep them on ``self.documents``
            2. store the number of documents on ``self.docs_no``
            3. store on ``self.docs_bag``, for each term, all documents
               in which that term exists and how many times it occurs

        The method returns nothing; all results live on the instance.

        Args:
            raw_files: dict (key = document key, value = document text)
        """
        logger.info("Preprocessing...")

        # The bare name below is the module-level function; method names
        # are not in scope inside a method body.
        documents = preprocess_all(raw_files)
        self.documents = documents

        doc_count = len(self.documents)
        self.docs_no = doc_count

        docs_bag = bag_of_documents(self.documents)
        self.docs_bag = docs_bag
示例#4
0
    def preprocess_all(self, raw_files):
        """
        Turn the raw files into documents and compute tf, idf and tf * idf.

        The documents come from the module-level ``preprocess_all``
        function and are stored on ``self.documents``, with their count
        on ``self.docs_no``. ``determine_idf`` and ``determine_tf`` are
        then run, in that order, and ``self.tf.multiply(self.idf)`` is
        stored on ``self.tf_idf``. Nothing is returned.

        Args:
            raw_files: dict[identifier] = text
        """
        logger.info("Preprocessing...")

        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        documents = preprocess_all(raw_files)
        self.documents = documents

        doc_count = len(self.documents)
        self.docs_no = doc_count

        # Keep the original order: idf is determined before tf.
        self.determine_idf()
        self.determine_tf()

        weighted = self.tf.multiply(self.idf)
        self.tf_idf = weighted
示例#5
0
    def preprocess_all(self, raw_files):
        """
        Build the documents from raw files and derive tf, idf and tf * idf.

        Results are kept on the instance: ``self.documents`` (output of
        the module-level ``preprocess_all`` function), ``self.docs_no``
        (their count) and ``self.tf_idf`` (``self.tf.multiply(self.idf)``,
        evaluated after ``determine_idf`` and ``determine_tf`` have run).
        The method itself returns nothing.

        Args:
            raw_files: dict[identifier] = text
        """
        logger.info("Preprocessing...")

        # The bare name below is the module-level function; method names
        # are not in scope inside a method body.
        documents = preprocess_all(raw_files)
        self.documents = documents

        doc_count = len(self.documents)
        self.docs_no = doc_count

        # Keep the original order: idf is determined before tf.
        self.determine_idf()
        self.determine_tf()

        weighted = self.tf.multiply(self.idf)
        self.tf_idf = weighted