Example #1
0
def test_get_all_files_from_directory():
    root_dir = os.path.dirname(os.path.abspath(__file__))
    directory = 'test_resources/documents'
    path = os.path.join(root_dir, directory)
    
    # the files directly inside the directory should be listed
    files = basic_tools.get_all_files_from_directory(path)
    assert os.path.join(path, 'test1.txt') in files
    assert os.path.join(path, 'test2.txt') in files
    
    # with recursion enabled, files in nested subdirectories are included as well
    path2 = os.path.join(path, 'test_directory')
    files2 = basic_tools.get_all_files_from_directory(path, True)
    print(files2)
    assert os.path.join(path, 'test1.txt') in files2
    assert os.path.join(path, 'test2.txt') in files2
    assert os.path.join(path2, 'test3.txt') in files2
    assert os.path.join(path2, 'test_dir2/test4.txt') in files2
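For context, here is a minimal sketch of what a helper like basic_tools.get_all_files_from_directory could look like; only the call signature (a directory path plus an optional recursive flag) comes from the test above, and the implementation below is an assumption based on os.listdir/os.walk.

import os

def get_all_files_from_directory(directory, recursive=False):
    # Sketch only: the real basic_tools implementation may differ.
    if recursive:
        # walk the whole tree and collect every file path
        return [os.path.join(dirpath, name)
                for dirpath, _, filenames in os.walk(directory)
                for name in filenames]
    # otherwise list only the files directly inside the directory
    return [os.path.join(directory, name)
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))]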
Example #2
0
    def __init__(self, dataset_directory, is_recursive=True):
        """
        dataset_directory -- a relative or absolute file path to the \
        directory that contains the documents directory and the dataset_metadata.txt file.
        is_recursive -- find documents recursively in the documents directory, \
                        by default documents will be found recursively.
        Document metadata types will be infered from the document metadata.
        """
        # create commonly used directory and file paths
        self._dataset_directory = dataset_directory
        self._abs_dataset_directory = os.path.abspath(dataset_directory)
        self._metadata_file = os.path.join(self._abs_dataset_directory,
                                           'dataset_metadata.txt')
        self._documents_directory = os.path.join(self._abs_dataset_directory,
                                                 'documents')
        self.is_recursive = is_recursive
        self._filters = []

        self.metadata = {}
        # load the dataset metadata
        with io.open(self._metadata_file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as meta_file:
            content = meta_file.read()
            metadata, __ = basic_tools.seperate_metadata_and_content(content)
            self.metadata = basic_tools.metadata_to_dict(metadata)
        self.metadata_types = {}
        basic_tools.collect_types(self.metadata_types, self.metadata)

        if 'readable_name' not in self.metadata:
            identifier = self._dataset_directory.replace('_', ' ').replace(
                '/', ' ').title()
        else:
            identifier = self.metadata['readable_name']
        self.name = basic_tools.remove_punctuation(identifier)

        # find and sort all file paths
        self._list_of_documents = basic_tools.get_all_files_from_directory(
            self._documents_directory, self.is_recursive)
        self._list_of_documents.sort()

        # find any bad documents and find document metadata
        self.document_metadata_types = {}
        bad_doc_indices = []
        for doc_index, doc in enumerate(self):
            try:
                basic_tools.collect_types(self.document_metadata_types,
                                          doc.metadata)
            except Exception:
                print("Bad document: ", self._list_of_documents[doc_index])
                bad_doc_indices.append(doc_index)
        # pop from the end so earlier indices remain valid while deleting
        while bad_doc_indices:
            remove_index = bad_doc_indices.pop()
            del self._list_of_documents[remove_index]
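Based on the docstring and the paths built above, a typical call might look like the sketch below; the class name Dataset and the example directory layout are assumptions, only the constructor arguments and attributes come from the code above.

# Expected on-disk layout (from the paths built in __init__):
#   my_dataset/
#       dataset_metadata.txt
#       documents/
#           doc1.txt
#           nested/doc2.txt
dataset = Dataset('my_dataset', is_recursive=True)
print(dataset.name)            # readable name from metadata, or derived from the path
print(dataset.metadata_types)  # metadata types inferred from dataset_metadata.txt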