Exemplo n.º 1
0
    def test_paths_to_blocks_with_missing(self, test_data_001, test_data_002,
                                          make_target_series, splitter_func,
                                          test_data_no_file):
        """Check the path decoders when one of the input paths has no file.

        With ``missing_file='ignore'`` both ``paths_to_blocks`` and
        ``threaded_paths_to_blocks`` must return the two regular target
        series followed by a single empty-string entry indexed by the missing
        document's uid. With ``missing_file='raise'`` the threaded variant
        must raise ``FileNotFoundError``.
        """
        # Expected output: regular targets plus one '' row for the missing doc.
        missing_row = pd.Series([''], index=[test_data_no_file.uid])
        expected_parts = [
            make_target_series(test_data_001),
            make_target_series(test_data_002),
            missing_row,
        ]
        target_ds = pd.concat(expected_parts, axis=0)

        samples = (test_data_001, test_data_002, test_data_no_file)
        paths_ds = pd.Series(
            [sample.pdf for sample in samples],
            index=[sample.uid for sample in samples],
        )

        # Sequential decoder first, then the threaded one.
        for decode in (PDFDecoder.paths_to_blocks,
                       PDFDecoder.threaded_paths_to_blocks):
            output = decode(paths_ds,
                            split_func=splitter_func,
                            missing_file='ignore')
            assert output.equals(target_ds)

        with pytest.raises(FileNotFoundError):
            PDFDecoder.threaded_paths_to_blocks(paths_ds,
                                                split_func=splitter_func,
                                                missing_file='raise')
Exemplo n.º 2
0
 def test_invalid_return_type(self):
     """An unknown ``return_type`` must raise ``ValueError``.

     Checked for both ``threaded_texts_to_blocks`` (whole series) and
     ``text_to_blocks_series`` (single text).
     """
     bad_return_type = 'incorrect_input'
     texts = ['first text\n\n2lines', 'second text\n\nline\n\nlineagain']
     text_series = pd.Series(texts)

     with pytest.raises(ValueError):
         PDFDecoder.threaded_texts_to_blocks(text_series,
                                             return_type=bad_return_type)

     first_text = text_series.iloc[0]
     with pytest.raises(ValueError):
         PDFDecoder.text_to_blocks_series(first_text,
                                          return_type=bad_return_type)
Exemplo n.º 3
0
 def test_path_to_blocks_series(self, test_data_001, test_data_002,
                                make_target_series, splitter_func):
     """``path_to_blocks_series`` must reproduce each document's target series.

     Each sample PDF is decoded with its uid passed as ``index`` and the
     result is compared to the series built by ``make_target_series``.
     """
     for sample in (test_data_001, test_data_002):
         target = make_target_series(sample)
         blocks_series = PDFDecoder.path_to_blocks_series(
             sample.pdf,
             split_func=splitter_func,
             index=sample.uid,
         )
         assert target.equals(blocks_series)
Exemplo n.º 4
0
 def test_empty_content_raise(
     self,
     test_data_001,
     test_data_002,
     make_target_series,
     splitter_func,
 ):
     """``none_content='raise'`` must raise ``RuntimeError`` on ``None``.

     The content series holds ``None`` for the first document and the raw
     PDF bytes for the second one.
     """
     samples = (test_data_001, test_data_002)
     raw_contents = []
     for sample in samples:
         with open(sample.pdf, mode='rb') as pdf_file:
             raw_contents.append(pdf_file.read())
     # Replace the first document's bytes with None to trigger the error.
     raw_contents[0] = None
     content_ds = pd.Series(raw_contents,
                            index=[sample.uid for sample in samples])

     with pytest.raises(RuntimeError):
         PDFDecoder.threaded_contents_to_text(content_ds,
                                              none_content='raise')
Exemplo n.º 5
0
 def test_texts_to_blocks_as_list(self):
     """``return_type='as_list'`` must yield one list of blocks per text.

     Checked for ``threaded_texts_to_blocks`` on a two-element series and
     for ``text_to_blocks_series`` on a single text with an explicit index.
     """
     text_series = pd.Series(
         ['first text\n\n2lines', 'second text\n\nline\n\nlineagain'],
         index=pd.Index(['001', '002']),
     )
     target = pd.Series(
         [['first text', '2lines'], ['second text', 'line', 'lineagain']],
         index=pd.Index(['001', '002']),
     )

     blocks_ds = PDFDecoder.threaded_texts_to_blocks(
         text_series,
         return_type='as_list',
     )
     assert blocks_ds.equals(target)

     # No debug prints needed: pytest reports both operands on failure.
     single_ds = PDFDecoder.text_to_blocks_series('first text\n\n2lines',
                                                  index='001',
                                                  return_type='as_list')
     assert single_ds.loc['001'] == target.loc['001']
Exemplo n.º 6
0
 def test_paths_to_blocks(self, test_data_001, test_data_002,
                          make_target_series, splitter_func):
     """``paths_to_blocks`` on two PDFs must equal the concatenated targets.

     The input is a series of PDF paths indexed by document uid.
     """
     samples = (test_data_001, test_data_002)
     target_ds = pd.concat(
         [make_target_series(sample) for sample in samples],
         axis=0,
     )
     paths_ds = pd.Series(
         [sample.pdf for sample in samples],
         index=[sample.uid for sample in samples],
     )
     output = PDFDecoder.paths_to_blocks(paths_ds, split_func=splitter_func)
     assert output.equals(target_ds)
Exemplo n.º 7
0
 def test_content_to_text_to_blocks(
     self,
     test_data_001,
     test_data_002,
     make_target_series,
     splitter_func,
 ):
     """Chain contents -> text -> blocks and compare with the targets.

     Raw PDF bytes are decoded with ``threaded_contents_to_text`` and the
     resulting texts are split with ``threaded_texts_to_blocks``; the output
     must equal the concatenation of both documents' target series.
     """
     samples = (test_data_001, test_data_002)
     target_ds = pd.concat(
         [make_target_series(sample) for sample in samples],
         axis=0,
     )

     raw_contents = []
     for sample in samples:
         with open(sample.pdf, mode='rb') as pdf_file:
             raw_contents.append(pdf_file.read())
     content_ds = pd.Series(raw_contents,
                            index=[sample.uid for sample in samples])

     text_ds = PDFDecoder.threaded_contents_to_text(content_ds)
     blocks_ds = PDFDecoder.threaded_texts_to_blocks(
         text_ds,
         split_func=splitter_func,
     )
     assert blocks_ds.equals(target_ds)
Exemplo n.º 8
0
def index():
    """Serve the upload page and process a submitted file.

    For any method other than POST the ``home/index.html`` template is
    rendered. On POST the session is cleared, the uploaded ``attached`` file
    is decoded to text, split with ``splitter`` and passed to
    ``simselector.predict`` as a one-element series. The first prediction and
    the raw file bytes are stored in the session before redirecting to
    ``home.result``.
    """
    if request.method != 'POST':
        return render_template('home/index.html')

    session.clear()
    upload = request.files['attached']
    decoded_text = PDFDecoder.content_to_text(upload)
    blocks = pd.Series([splitter(decoded_text)])
    session['result'] = simselector.predict(blocks).iloc[0]

    # Rewind before re-reading: the decoder has presumably consumed the
    # stream, and the session needs the file from its first byte.
    upload.stream.seek(0)
    session['filecontent'] = upload.read()
    return redirect(url_for('home.result'))
Exemplo n.º 9
0
 def test_path_to_blocks(self, test_data_001, splitter_func):
     """``path_to_blocks`` must return the reference blocks plus ``'\\x0c'``.

     The reference blocks are read from the ``;``-separated, header-less
     file at ``test_data_001.blocks``; a final ``'\\x0c'`` entry is appended
     before comparing with the decoder output.
     """
     # ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
     # 2.0; squeezing the parsed frame along the columns axis is the
     # documented replacement and yields the same single-column Series.
     frame = pd.read_csv(
         test_data_001.blocks,
         encoding='utf-8-sig',
         sep=';',
         header=None,
         names=['blocks'],
     )
     target = list(frame.squeeze('columns'))
     target.append('\x0c')
     blocks = PDFDecoder.path_to_blocks(test_data_001.pdf,
                                        split_func=splitter_func)
     assert target == blocks
Exemplo n.º 10
0
 def test_invalid_arg(
     self,
     test_data_001,
     test_data_002,
     make_target_series,
     splitter_func,
 ):
     """An unknown ``none_content`` value must raise ``ValueError``.

     ``threaded_contents_to_text`` is called on a series holding the raw
     bytes of both sample PDFs.
     """
     samples = (test_data_001, test_data_002)
     raw_contents = []
     for sample in samples:
         with open(sample.pdf, mode='rb') as pdf_file:
             raw_contents.append(pdf_file.read())
     content_ds = pd.Series(raw_contents,
                            index=[sample.uid for sample in samples])

     with pytest.raises(ValueError):
         PDFDecoder.threaded_contents_to_text(
             content_ds,
             none_content='incorrect_input',
         )
Exemplo n.º 11
0
 def test_empty_content_empty(
     self,
     test_data_001,
     test_data_002,
     make_target_series,
     splitter_func,
 ):
     """``none_content='to_empty'`` must map empty bytes to an empty string.

     The first document's content is replaced by ``b''``; the second keeps
     its real PDF bytes and must decode to the reference text followed by
     ``'\\x0c'``.
     """
     samples = (test_data_001, test_data_002)
     raw_contents = []
     for sample in samples:
         with open(sample.pdf, mode='rb') as pdf_file:
             raw_contents.append(pdf_file.read())
     raw_contents[0] = b''
     content_ds = pd.Series(raw_contents,
                            index=[sample.uid for sample in samples])

     texts_ds = PDFDecoder.threaded_contents_to_text(
         content_ds,
         none_content='to_empty',
     )
     assert texts_ds.iloc[0] == ''

     reference = Path(test_data_002.txt).read_text(encoding='utf-8-sig')
     assert texts_ds.iloc[1] == reference + '\x0c'
Exemplo n.º 12
0
 def test_path_to_text_incorrect_params(self):
     """An unknown ``missing_file`` value must raise ``ValueError``."""
     bad_option = 'incorrect'
     with pytest.raises(ValueError):
         PDFDecoder.path_to_text('', missing_file=bad_option)
Exemplo n.º 13
0
 def test_path_to_text_corrupted(self):
     """A corrupted PDF must decode to ``''`` under ``missing_file='raise'``."""
     data_dir = Path(__file__).parent / 'test_data'
     corrupted_pdf = data_dir / '003_corrupted_pdf.pdf'
     decoded = PDFDecoder.path_to_text(corrupted_pdf, missing_file='raise')
     assert decoded == ''
Exemplo n.º 14
0
 def test_path_to_text_no_file(self):
     """Check ``path_to_text`` on a path that does not exist.

     ``missing_file='ignore'`` must return ``''``; calling without the
     ``missing_file`` argument must raise ``FileNotFoundError``.
     """
     missing_path = 'incorect/path'
     ignored = PDFDecoder.path_to_text(missing_path, missing_file='ignore')
     assert ignored == ''
     with pytest.raises(FileNotFoundError):
         PDFDecoder.path_to_text(missing_path)
Exemplo n.º 15
0
 def test_path_to_text_2(self, test_data_002):
     """``path_to_text`` on sample 002 must equal its text plus ``'\\x0c'``."""
     reference = Path(test_data_002.txt).read_text(encoding='utf-8-sig')
     expected = reference + '\x0c'
     decoded = PDFDecoder.path_to_text(test_data_002.pdf)
     assert decoded == expected
Exemplo n.º 16
0
 def test_path_to_text(self, test_data_001):
     """``path_to_text`` on sample 001 must equal its text plus ``'\\x0c'``."""
     # Decode explicitly instead of relying on the platform's locale
     # encoding; 'utf-8-sig' also tolerates a leading BOM in the file.
     target = Path(test_data_001.txt).read_text(encoding='utf-8-sig')
     target += '\x0c'
     assert PDFDecoder.path_to_text(test_data_001.pdf) == target
Exemplo n.º 17
0
 def test_incorrect_content_type(self):
     """Passing a bare ``bytes`` object must raise ``AttributeError``."""
     raw_bytes = b''
     with pytest.raises(AttributeError):
         PDFDecoder.content_to_text(raw_bytes, none_content='to_empty')
Exemplo n.º 18
0
 def test_none_content(self):
     """Check both valid ``none_content`` modes on an empty byte stream.

     ``'raise'`` must raise ``RuntimeError``; ``'to_empty'`` must return
     ``''``. The same ``BytesIO`` object is used for both calls.
     """
     empty_stream = BytesIO(b'')
     with pytest.raises(RuntimeError):
         PDFDecoder.content_to_text(empty_stream, none_content='raise')
     decoded = PDFDecoder.content_to_text(empty_stream,
                                          none_content='to_empty')
     assert decoded == ''
Exemplo n.º 19
0
 def test_invalid_none_content_arg(self):
     """An unknown ``none_content`` value must raise ``ValueError``."""
     bad_option = 'incorrect_input'
     with pytest.raises(ValueError):
         PDFDecoder.content_to_text(b'', none_content=bad_option)