Example #1
def filter_only_full_text(corpus):
    newcorpus = Dataset()
    for docid, document in corpus.documents.items():
        if is_full_text(document):
            newcorpus.documents[docid] = document

    return newcorpus
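
A minimal usage sketch, assuming a Dataset built from Document/Part objects as in the later examples; the is_full_text predicate below is a hypothetical stand-in for whatever heuristic the project actually uses (it is not shown above):

from nalaf.structures.data import Dataset, Document, Part

def is_full_text(document):
    # hypothetical heuristic for this sketch: more than one part means full text
    return len(document.parts) > 1

corpus = Dataset()
doc = Document()
doc.parts['title'] = Part('Some title')
doc.parts['body'] = Part('Some body text')
corpus.documents['pmid_1'] = doc

full_text_only = filter_only_full_text(corpus)
print(list(full_text_only.documents.keys()))  # ['pmid_1'] under this heuristic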
Example #2
    def filter(self, documents):
        pycrf = PyCRFSuite(self.binary_model)
        for pmid, doc in documents:
            dataset = Dataset()
            dataset.documents[pmid] = doc
            self.pipeline.execute(dataset)
            self.labeler.label(dataset)
            pycrf.tag(dataset, MUT_CLASS_ID)
            PostProcessing().process(dataset)
            ExclusiveNLDefiner().define(dataset)
            total_nl_mentions = []
            for part in doc:
                # print(part.annotations)
                print_verbose('predicted_annotations:',
                              part.predicted_annotations)
                nl_mentions = [
                    (ann.text, ann.subclass, ann.confidence)
                    for ann in part.predicted_annotations
                    if ann.subclass != 0 and ann.confidence <= self.threshold
                ]
                total_nl_mentions += nl_mentions
            if any(total_nl_mentions):
                print('nl mentions', json.dumps(total_nl_mentions, indent=4))
                yield pmid, doc
            else:
                print_verbose('nothing found')
Example #3
def get_corpora(names, only_class_id=None):
    dataset = Dataset()

    for name in names.split(','):
        dataset.extend_dataset(get_corpus(name, only_class_id=only_class_id))

    return dataset
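
A hedged usage sketch: the corpus names are placeholders, and get_corpus is assumed to be the single-corpus loader called above.

# Load and merge two corpora into a single Dataset (corpus names are placeholders).
merged = get_corpora('corpus_a,corpus_b', only_class_id=None)
print(len(merged.documents))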
Example #4
File: test_data.py Project: zxsted/nalaf
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        part1 = Part('123')
        part2 = Part('45678')
        ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='2',
                      confidence=0)
        ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='567',
                      confidence=1)
        ann1.subclass = 0
        ann2.subclass = 2
        part1.annotations.append(ann1)
        part2.annotations.append(ann2)
        cls.doc.parts['s1h1'] = part1
        cls.doc.parts['s2p1'] = part2

        doc2 = Document()
        doc3 = Document()
        doc3.parts['someid'] = Part('marmor stein und eisen')
        cls.dataset2 = Dataset()
        cls.dataset2.documents['newid'] = doc3
        cls.dataset2.documents['testid'] = doc2
Example #5
    def test_DocumentLevelRelationEvaluator_parts_irrelevant(self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_ PART *1*')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_2 = Part('_irrelevant_ PART *2*')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_2'] = part_2

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_2.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_2, 0, "maynard"), Entity(STUB_E_ID_1, 0, "TOOL")),
        ]

        self._apply_pipeline(dataset)

        # ---

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #6
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        xmls = []
        if os.path.isdir(self.path):
            xmls = [
                os.path.join(root, file)
                for root, _, files in os.walk(self.path) for file in files
                if file.startswith('medline') and file.endswith('xml')
            ]
        elif self.path.startswith('medline') and self.path.endswith('xml'):
            xmls = [self.path]

        dataset = Dataset()

        for xml in xmls:
            for child in ET.parse(xml).getroot():
                pmid = next(child.iter('PMID')).text

                document = Document()
                article = next(child.iter('Article'))
                title = next(article.iter('ArticleTitle')).text
                document.parts['title'] = Part(title, is_abstract=False)
                try:
                    abstract = next(article.iter('AbstractText')).text
                    document.parts['abstract'] = Part(abstract)
                except StopIteration:
                    pass
                dataset.documents[pmid] = document

        return dataset
Example #7
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:

            for row in file:
                columns = row.split("\t")

                docid = columns[0]
                typ = columns[1]
                start = columns[2]
                end = columns[3]
                entity_text = columns[7]

                class_id = None
                if typ == 'Mutation':
                    class_id = self.mut_class_id
                elif typ == 'AminoacidResidue':
                    class_id = self.residue_class_id

                if class_id:
                    document = dataset.documents.get(docid, Document())

                    part = Part(entity_text)
                    document.parts[typ + '|' + start + '|' + end] = part

                    part.annotations.append(
                        Entity(class_id, int(start), entity_text))

                    dataset.documents[docid] = document

        return dataset
Example #8
File: readers.py Project: zxsted/nalaf
    def read_file(a_file,
                  filename,
                  dataset=None,
                  whole_basename_as_docid=False):
        if dataset is None:
            dataset = Dataset()

        soup = BeautifulSoup(a_file, "html.parser")
        document = Document()

        for part in soup.find_all(id=re.compile('^s')):
            if re.match(r'^s[3-9]', part['id']):
                is_abstract = False
            else:
                is_abstract = True
            document.parts[part['id']] = Part(str(part.string),
                                              is_abstract=is_abstract)

        doc_id = os.path.basename(filename).replace('.plain.html', '').replace(
            '.html', '').replace('.xml', '')
        if not whole_basename_as_docid and '-' in doc_id:
            doc_id = doc_id.split('-')[-1]

        dataset.documents[doc_id] = document

        return dataset
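
A sketch of calling read_file directly; the same call pattern appears in Example #26 below. The file path is a placeholder, and the import path for HTMLReader is an assumption (the snippets above show only the class name):

from nalaf.structures.data import Dataset
from nalaf.utils.readers import HTMLReader  # import path assumed

dataset = Dataset()
path = '/path/to/12345678.plain.html'  # placeholder path
with open(path, 'rb') as a_file:
    HTMLReader.read_file(a_file, path, dataset=dataset, whole_basename_as_docid=False)

print(list(dataset.documents.keys()))  # doc id derived from the file name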
Example #9
    def __is_acceptable(self, doc_id, doc, annotators):
        if len(annotators) == 1:
            return True

        from itertools import combinations
        from nalaf.structures.data import Dataset
        from nalaf.learning.evaluators import MentionLevelEvaluator
        import math

        agreement = []
        for first, second in combinations(annotators, 2):
            data = Dataset()
            data.documents[doc_id] = doc

            AnnJsonAnnotationReader(first).annotate(data)
            AnnJsonAnnotationReader(second, is_predicted=True).annotate(data)
            results = MentionLevelEvaluator().evaluate(data)
            if not math.isnan(results[-1]):
                agreement.append(results[-1])

        # clean the doc from any annotations we added to calculate agreement
        for part in doc.parts.values():
            part.annotations = []
            part.predicted_annotations = []

        return agreement and sum(agreement)/len(agreement) >= self.iaa_threshold
Example #10
File: getIAA.py Project: marilenaoita/nala
def benchmark_nala(member1, member2):
    itrs = []

    # Read the IAA iterations in blocks so that the plain documents are not deleted by the AnnJsonAnnotationReaders
    for itr in IterationRound.all():
        if itr.is_IAA():
            dataset = itr.read(read_annotations=False)
            AnnJsonAnnotationReader(
                os.path.join(itr.path, "reviewed", member1),
                read_only_class_id=MUT_CLASS_ID,
                delete_incomplete_docs=False).annotate(dataset)
            AnnJsonAnnotationReader(os.path.join(itr.path, "reviewed",
                                                 member2),
                                    read_only_class_id=MUT_CLASS_ID,
                                    delete_incomplete_docs=False,
                                    is_predicted=True).annotate(dataset)
            itrs.append(dataset)
            dataset = None

    # Then merge the IAA iterations
    all_itrs_dataset = Dataset()
    for itr_dataset in itrs:
        all_itrs_dataset.extend_dataset(itr_dataset)

    ExclusiveNLDefiner().define(all_itrs_dataset)

    return (all_itrs_dataset, MentionLevelEvaluator(
        subclass_analysis=True).evaluate(all_itrs_dataset))
Example #11
    def _create_basic_dataset(self):
        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1
        self._apply_pipeline(dataset)
        return (dataset, part_1)
Example #12
    def test_DocumentLevelRelationEvaluator_default_entities_case_irrelevant(
            self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_1.predicted_relations = [
            # empty
        ]

        self._apply_pipeline(dataset)

        # -

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.0)

        # ---

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)

        # -

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "tool"),
                     Entity(STUB_E_ID_2, 0, "MAYNARD")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #13
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        with DownloadArticle() as da:
            for pmid, doc in da.download(self.pmids):
                dataset.documents[pmid] = doc
        return dataset
Example #14
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[Token('Make', 0), Token('making', 5), Token('made', 12)],
                          [Token('Try', 18), Token('tried', 22), Token('tries', 28)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.generator = PorterStemFeatureGenerator()
Example #15
    def generate_abstracts(self, list_of_pmids):
        """
        Generates list of documents using pmids and the restapi interface from tmtools.
        Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
        :param list_of_pmids: strings
        :return nalaf.structures.Dataset: dataset
        """
        url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
        url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

        # load cache.json if exists
        if os.path.exists('cache.json'):
            with open('cache.json', 'r', encoding='utf-8') as f:
                tm_var = json.load(f)
        else:
            tm_var = {}

        for pmid in list_of_pmids:
            if pmid not in tm_var:  # if pmid was not already downloaded from tmTools
                req = requests.get(url_tmvar.format(pmid))
                try:
                    tm_var[pmid] = req.json()
                except ValueError:
                    pass
        # cache the tmVar annotations so we don't pull them every time
        with open('cache.json', 'w') as file:
            json.dump(tm_var, file, indent=4)

        # for key in tm_var:
        #     print(json.dumps(tm_var[key], indent=4))

        dataset = Dataset()
        for doc_id in list_of_pmids:
            if doc_id in tm_var:
                doc = Document()
                text = tm_var[doc_id]['text']
                part = Part(text)
                denotations = tm_var[doc_id]['denotations']
                annotations = []
                for deno in denotations:
                    ann = Entity(
                        class_id=self.mut_class_id,
                        offset=int(deno['span']['begin']),
                        text=text[deno['span']['begin']:deno['span']['end']])
                    annotations.append(ann)
                    # note should the annotations from tmvar go to predicted_annotations or annotations?
                part.annotations = annotations
                doc.parts['abstract'] = part
                dataset.documents[doc_id] = doc

        return dataset
Example #16
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        part = Part(self.string)
        document = Document()
        dataset = Dataset()

        dataset.documents['doc_1'] = document
        document.parts['part_1'] = part

        return dataset
Example #17
File: test_simple.py Project: zxsted/nalaf
class TestSimpleFeatureGenerator(unittest.TestCase):
    def setUp(self):
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        part.sentences = [[
            Token('Word1', 0),
            Token('word2', 6),
            Token('word3', 12)
        ], [Token('Word4', 19),
            Token('word5', 25),
            Token('word6', 31)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()

    def test_simple_generate(self):
        self.simple_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = iter([{
            'word[0]': 'Word1'
        }, {
            'word[0]': 'word2'
        }, {
            'word[0]': 'word3'
        }, {
            'word[0]': 'Word4'
        }, {
            'word[0]': 'word5'
        }, {
            'word[0]': 'word6'
        }])
        for feature in features:
            self.assertEqual(feature, next(expected))

    def test_sentence_generate(self):
        self.sentence_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = iter([{
            'BOS[0]': 1
        }, {}, {
            'EOS[0]': 1
        }, {
            'BOS[0]': 1
        }, {}, {
            'EOS[0]': 1
        }])

        for feature in features:
            self.assertEqual(feature, next(expected))
Example #18
File: test_data.py Project: zxsted/nalaf
    def setUpClass(cls):
        text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"

        part1 = Part(text1)
        doc = Document()
        doc.parts['firstpart'] = part1
        dataset = Dataset()
        dataset.documents['firstdocument'] = doc

        NLTKSplitter().split(dataset)
        # TmVarTokenizer().tokenize(dataset)
        cls.data = dataset
        cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
Example #19
File: test_window.py Project: zxsted/nalaf
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[
            Token('Make', 0),
            Token('making', 5),
            Token('made', 12)
        ], [Token('Try', 18),
            Token('tried', 22),
            Token('tries', 28)]]
        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        for token in self.dataset.tokens():
            token.features['a'] = 'a'
            token.features['b'] = 'b'
Example #20
File: test_data.py Project: zxsted/nalaf
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc


        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
Example #21
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

        entities = [
            # Sent 1
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
            # Sent 2
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
            # Sent 3
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
            # Sent 4

        ]

        for e in entities:
            part1.annotations.append(e)

        cls.doc.parts['s1h1'] = part1

        cls.splitter = NLTKSplitter()
        cls.tokenizer = NLTK_TOKENIZER

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)

        # assert False, str(list(cls.dataset.sentences()))
        assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
Example #22
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        read each .txt file in the directory, parse it and create and instance of Document
        form a dataset consisting of every document parsed and return it

        :returns structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.corpus_file, encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            for row in reader:
                document = Document()
                document.parts['abstract'] = Part(row[1])
                dataset.documents[row[0]] = document

        return dataset
Example #23
File: test_simple.py Project: zxsted/nalaf
    def setUp(self):
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        part.sentences = [[
            Token('Word1', 0),
            Token('word2', 6),
            Token('word3', 12)
        ], [Token('Word4', 19),
            Token('word5', 25),
            Token('word6', 31)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()
Example #24
    def setUpClass(cls):
        cls.dataset = Dataset()

        doc1 = Document()
        cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1

        for s in TEST_SENTENCES_SINGLE_ROOT:
            part = Part(s)
            doc1.parts[s] = part

        doc2 = Document()
        cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2

        for s in TEST_SENTENCES_MULTI_ROOT:
            part = Part(s)
            doc2.parts[s] = part

        cls.nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(cls.nlp)
        cls.splitter = NLTKSplitter()
        cls.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)
        cls.parser.parse(cls.dataset)

        cls.computed_sentences = []

        for sentence in cls.dataset.sentences():
            dist, then = compute_shortest_paths(sentence)
            cls.computed_sentences.append((dist, then, sentence))
Example #25
    def __read_directory_localfs(self):
        dataset = Dataset()

        filenames = glob.glob(str(self.path + "/**/*.html"), recursive=True) + glob.glob(str(self.path + "/**/*.xml"), recursive=True)
        for filename in filenames:
            dataset = self.__read_file_path_localfs(filename, dataset)

        return dataset
Example #26
    def __read_file_path_localfs(self, filename, dataset=None):
        if dataset is None:
            dataset = Dataset()

        with open(filename, 'rb') as a_file:
            HTMLReader.read_file(a_file, filename, dataset, self.whole_basename_as_docid)

        return dataset
Example #27
    def __read_directory_hdfs(self):
        dataset = Dataset()

        filenames = walk_hdfs_directory(self.hdfs_client, self.path, lambda fname: fname.endswith(".html") or fname.endswith(".xml"))
        for filename in filenames:
            dataset = self.__read_file_path_hdfs(filename, dataset)

        return dataset
Example #28
    def __read_file_path_hdfs(self, filename, dataset=None):
        if dataset is None:
            dataset = Dataset()

        with self.hdfs_client.read(filename) as reader:
            HTMLReader.read_file(reader, filename, dataset, self.whole_basename_as_docid)

        return dataset
Example #29
    def setUpClass(cls):
        cls.dataset = Dataset()
        doc = Document()
        part = Part(
            'This is one sentence. This is another one.\n This is the third one; here continues.'
        )
        cls.dataset.documents['doc_1'] = doc
        doc.parts['part_1'] = part
Example #30
File: readers.py Project: zxsted/nalaf
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        if os.path.isdir(self.path):
            for filename in glob.glob(self.path + '/*.txt'):
                doc_id, doc = self.__process_file(filename)
                dataset.documents[doc_id] = doc
        else:
            if os.path.splitext(self.path)[-1] == '.txt':
                doc_id, doc = self.__process_file(self.path)
                dataset.documents[doc_id] = doc
            else:
                raise Exception('not a .txt file extension')

        return dataset