Пример #1
0
    def test_render(self):
        """Run the search module unconfigured, then with plain, case-sensitive and regex queries."""
        # With no columns chosen the module produces an empty table
        rendered = execute_wfmodule(self.wf_module)
        self.assertTrue(rendered.empty)

        # Plain search for 'Feb' in the Month column
        self.query_pval.set_value('Feb')
        self.colnames_pval.set_value('Month')
        rendered = execute_wfmodule(self.wf_module)
        expected_feb = '  Month  Amount\n1   Feb      20'
        self.assertEqual(str(rendered), expected_feb)

        # Case-sensitive search: lowercase 'feb' matches no row
        self.query_pval.set_value('feb')
        self.case_pval.set_value(True)
        rendered = execute_wfmodule(self.wf_module)
        self.assertTrue(rendered.empty)

        # Regex alternation matches both rows
        self.query_pval.set_value('Jan|Feb')
        self.regex_pval.set_value(True)
        rendered = execute_wfmodule(self.wf_module)
        expected_both = '  Month  Amount\n0   Jan      10\n1   Feb      20'
        self.assertEqual(str(rendered), expected_both)
    def test_render(self):
        """Check column selection: one column, a padded name, column order and a bad name."""
        # select a single column
        self.cols_pval.value = 'Month'
        self.cols_pval.save()
        out = execute_wfmodule(self.wf_module)
        table = mock_csv_table[['Month']]
        self.assertEqual(str(out), str(table))

        # select a single column, with stripped whitespace
        self.cols_pval.value = 'Month '
        self.cols_pval.save()
        out = execute_wfmodule(self.wf_module)
        self.assertEqual(str(out), str(table))

        # reverse column order, should not reverse: the output keeps the
        # table's own column order, so compare against that explicit selection
        self.cols_pval.value = 'Amount,Month'
        self.cols_pval.save()
        out = execute_wfmodule(self.wf_module)
        table = mock_csv_table[['Month', 'Amount']]
        self.assertEqual(str(out), str(table))

        # bad column name should produce error; only the stored status
        # matters here, so the returned table is ignored
        self.cols_pval.value = 'Amountxxx,Month'
        self.cols_pval.save()
        execute_wfmodule(self.wf_module)
        self.wf_module.refresh_from_db()
        self.assertEqual(self.wf_module.status, WfModule.ERROR)
Пример #3
0
    def test_execute_cache_hit(self):
        """A repeated execute must return the same result without dispatching a render."""
        workflow = create_testdata_workflow(table_csv)
        wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

        # First run -- which should cache the result
        first_result = execute_wfmodule(wf_module2)

        # Second run: the patched render dispatcher must stay untouched
        with mock.patch('server.dispatch.module_dispatch_render') as render_mock:
            second_result = execute_wfmodule(wf_module2)
            self.assertFalse(render_mock.called)
            self.assertEqual(second_result, first_result)
Пример #4
0
    def test_bad_colname(self):
        """An empty column name gives an empty table; an unknown one sets ERROR status."""
        # No output if no column given
        set_string(self.col_pval, '')
        rendered = execute_wfmodule(self.wf_module)
        self.assertTrue(rendered.empty)

        # An unknown column name must leave the module in ERROR status
        set_string(self.col_pval, 'hilarious')
        execute_wfmodule(self.wf_module)
        self.wf_module.refresh_from_db()
        self.assertEqual(self.wf_module.status, WfModule.ERROR)
Пример #5
0
    def test_bad_dates(self):
        """Columns that are not dates must put the module in ERROR status."""
        # 'Amount': integers are not dates.
        # 'Foo': weird strings are not dates (different error code path).
        for column in ('Amount', 'Foo'):
            set_string(self.col_pval, column)
            execute_wfmodule(self.wf_module)
            self.wf_module.refresh_from_db()
            self.assertEqual(self.wf_module.status, WfModule.ERROR)
Пример #6
0
    def test_render(self):
        """Count rows per date, ordered first by value and then by frequency."""
        # The CSV form is compared instead of str(out) to ensure rows are
        # output in index order (otherwise variable)
        set_string(self.col_pval, 'Date')

        # sort_pval: 0 = sort by value, 1 = sort by freq
        expected_csv_by_mode = (
            (0, 'date,count\n2011-07-25,1\n2016-01-10,2\n'),
            (1, 'date,count\n2016-01-10,2\n2011-07-25,1\n'),
        )
        for sort_mode, expected_csv in expected_csv_by_mode:
            set_integer(self.sort_pval, sort_mode)
            rendered = execute_wfmodule(self.wf_module)
            self.assertEqual(rendered.to_csv(index=False), expected_csv)
Пример #7
0
 def test_execute_revision_0(self):
     """Executing on a new workflow (rev=0, no caches) must not crash."""
     workflow = create_testdata_workflow(table_csv)
     wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

     actual = execute_wfmodule(wf_module2)
     expected = ProcessResult(table_dataframe)
     self.assertEqual(actual, expected)

     # Both modules report revision 0 for their cached render result
     revisions = cached_render_result_revision_list(workflow)
     self.assertEqual(revisions, [0, 0])
Пример #8
0
 def test_rename(self):
     """Renaming 'name' and 'count' must change exactly those two headers."""
     renames = {'name': 'name1', 'count': 'CNT'}
     self.entries_pval.value = json.dumps(renames)
     self.entries_pval.save()

     actual = execute_wfmodule(self.wf_module)

     # Expected: the reference table with the two headers replaced
     renamed_table = reference_table.copy()
     renamed_table.columns = ['name1', 'date', 'CNT', 'float']
     self.assertEqual(actual, ProcessResult(renamed_table))
Пример #9
0
 def test_reorder(self):
     """Apply three column moves in sequence and check the final column order."""
     # Moves are listed in chronological order as (column, from, to);
     # the table starts as ['name', 'date', 'count', 'float']
     moves = (
         ('count', 2, 0),  # gives ['count', 'name', 'date', 'float']
         ('name', 1, 2),   # gives ['count', 'date', 'name', 'float']
         ('float', 3, 1),  # gives ['count', 'float', 'date', 'name']
     )
     reorder_ops = [
         {'column': column, 'from': source, 'to': target}
         for column, source, target in moves
     ]
     self.history_pval.value = json.dumps(reorder_ops)
     self.history_pval.save()

     actual = execute_wfmodule(self.wf_module)
     expected = reordered_result(['count', 'float', 'date', 'name'])
     self.assertEqual(actual, expected)
Пример #10
0
    def test_scrape_table(self):
        """Fetch a mocked HTML table and check the stored data version and delta."""
        url = 'http://test.com/tablepage.html'
        self.url_pval.set_value(url)
        self.url_pval.save()

        # Nothing fetched yet: no data version, no table, no Deltas
        self.assertIsNone(self.wfmodule.get_fetched_data_version())
        self.assertIsNone(self.wfmodule.retrieve_fetched_table())
        self.assertIsNone(self.wfmodule.workflow.last_delta)

        # pandas.read_html is replaced so no network access happens
        with mock.patch('pandas.read_html',
                        return_value=[mock_csv_table]) as read_html_mock:
            self.press_fetch_button()
            expected_call = mock.call(url, flavor='html5lib')
            self.assertEqual(read_html_mock.call_args, expected_call)

        actual = execute_wfmodule(self.wfmodule)
        self.assertEqual(actual, ProcessResult(mock_csv_table))

        # The fetch should have created a new data version on the WfModule
        # and a new delta representing the change
        self.wfmodule.refresh_from_db()
        self.wfmodule.workflow.refresh_from_db()
        self.assertIsNotNone(self.wfmodule.get_fetched_data_version())
        self.assertIsNotNone(self.wfmodule.workflow.last_delta)
Пример #11
0
    def test_scrape_list(self):
        """Scrape URLs taken from the 'List' source, with the scraper and clock mocked."""
        source_options = "List|Input column".split('|')
        source_pval = get_param_by_id_name('urlsource')
        source_pval.value = source_options.index('List')
        source_pval.save()

        url_lines = [
            'http://a.com/file',
            'https://b.com/file2',
            'c.com/file/dir'  # Removed 'http://' to test the URL-fixing part
        ]
        get_param_by_id_name('urllist').set_value('\n'.join(url_lines))

        # Code below mostly lifted from the column test
        async def mock_scrapeurls(urls, table):
            # Stand-in scraper: copy the canned status/html columns into table
            table['status'] = self.scraped_table['status']
            table['html'] = self.scraped_table['html']

        # Freeze the clock and swap in the stand-in scraper; the real
        # scrape function is tested elsewhere
        with mock.patch('django.utils.timezone.now',
                        return_value=testnow), \
                mock.patch('server.modules.urlscraper.scrape_urls',
                           side_effect=mock_scrapeurls):
            self.press_fetch_button()
            actual = execute_wfmodule(self.wfmodule)
            self.assertEqual(actual, ProcessResult(self.scraped_table))
Пример #12
0
def wfmodule_columns(request, pk, format=None):
    """
    Return the column names and simplified types of a module's output as JSON.

    The response body is a JSON list with one ``{"name": ..., "type": ...}``
    object per output column, where type is "Number", "Date" or "String".

    Responds 404 when no WfModule has primary key ``pk`` and 403 when the
    user may not read the workflow. Only GET is handled; any other method
    falls through and returns None.
    """
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.workflow.user_authorized_read(request.user):
            return HttpResponseForbidden()

        with wf_module.workflow.cooperative_lock():
            table = execute_wfmodule(wf_module)
            dtypes = table.dtypes.to_dict()

        ret_types = []
        for col, dtype in dtypes.items():
            # We are simplifying the data types here.
            # More stuff can be added to these lists if we run into anything new.
            dtype_name = str(dtype)
            if dtype_name in ['int64', 'float64', 'bool']:
                stype = "Number"
            elif dtype_name in ['datetime64[ns]']:
                stype = "Date"
            else:
                stype = "String"
            # Exactly one entry per column, always in the same dict shape
            ret_types.append({
                "name": col,
                "type": stype
            })
        return HttpResponse(json.dumps(ret_types), content_type="application/json")
Пример #13
0
 def test_missing_column(self):
     """Reorders that name a missing column or an invalid position are ignored."""
     # If an input column is removed (e.g. via select columns)
     # then reorders which refer to it simply do nothing.
     # Moves are (column, from, to), starting from
     # ['name', 'date', 'count', 'float']
     moves = (
         ('count', 2, 0),             # gives ['count', 'name', 'date', 'float']
         ('nonexistent-name', 4, 1),  # invalid, nop
         ('count', 0, 4),             # invalid, nop
         ('float', 3, 2),             # gives ['count', 'name', 'float', 'date']
     )
     reorder_ops = [
         {'column': column, 'from': source, 'to': target}
         for column, source, target in moves
     ]
     self.history_pval.value = json.dumps(reorder_ops)
     self.history_pval.save()

     actual = execute_wfmodule(self.wf_module)
     expected = reordered_result(['count', 'name', 'float', 'date'])
     self.assertEqual(actual, expected)
Пример #14
0
def execute_and_notify(wf_module):
    """
    Execute a WfModule, push changed module attributes over websockets, and
    return the execute result.

    The client-visible attributes of every module in the workflow are
    compared before and after the execute; only modules whose attributes
    differ are included in the 'updateWfModules' message. No message is sent
    when nothing changed.
    """
    workflow = wf_module.workflow
    with workflow.cooperative_lock():
        # Snapshot every module's attributes before rendering
        before = {
            module.id: _client_attributes_that_change_on_render(module)
            for module in workflow.wf_modules.all()
        }

        result = execute.execute_wfmodule(wf_module)

        # Keep only the modules whose attributes differ from the snapshot
        changes = {}
        for module in workflow.wf_modules.all():
            prior = before[module.id]
            current = _client_attributes_that_change_on_render(module)
            if current != prior:
                changes[str(module.id)] = current

    # The message is sent after the lock has been released
    if changes:
        message = {'updateWfModules': changes}
        websockets.ws_client_send_delta_sync(wf_module.workflow_id, message)

    return result
Пример #15
0
    def test_resume_without_rerunning_unneeded_renders(self):
        """After only the second module's delta id changes, render is dispatched exactly once."""
        workflow = create_testdata_workflow(table_csv)
        wf_module1 = workflow.wf_modules.first()
        wf_module2 = load_and_add_module(
            'selectcolumns',
            workflow=workflow,
            last_relevant_delta_id=1,
        )
        wf_module1.last_relevant_delta_id = 1
        wf_module1.save()

        # Initial execute of the whole stack
        first_result = execute_wfmodule(wf_module2)

        # Move only the second module on to a newer delta
        wf_module2.refresh_from_db()
        wf_module2.last_relevant_delta_id = 2
        wf_module2.save()

        with mock.patch('server.dispatch.module_dispatch_render') as render_mock:
            render_mock.return_value = first_result
            second_result = execute_wfmodule(wf_module2)
            render_mock.assert_called_once()
            self.assertEqual(second_result, first_result)
Пример #16
0
 def test_load_csv_bad_content_type(self):
     """A CSV served as text/plain must still load as the CSV table."""
     # The response claims text/plain, so loading has to rely on filename
     # detection, as https://raw.githubusercontent.com/ does
     url = 'https://raw.githubusercontent.com/user/repo/branch/the.csv'
     self.url_pval.set_value(url)
     self.url_pval.save()

     with patch('requests.get') as fake_get:
         plain_text_response = mock_text_response(mock_csv_text, 'text/plain')
         fake_get.return_value = plain_text_response
         self.press_fetch_button()
         actual = execute_wfmodule(self.wfmodule)
         self.assertEqual(actual, ProcessResult(mock_csv_table))
Пример #17
0
 def test_nop_with_initial_col_selection(self):
     """Selecting a URL column without scraping must return the initial table."""
     # When a column is first selected and no scraping is performed, the
     # initial table should be returned
     source_options = "List of URLs|Load from column".split('|')
     column_source_index = source_options.index('Load from column')

     source_pval = get_param_by_id_name('urlsource')
     source_pval.value = column_source_index
     source_pval.save()

     column_pval = get_param_by_id_name('urlcol')
     column_pval.value = 'url'
     column_pval.save()

     actual = execute_wfmodule(self.wfmodule)
     self.assertEqual(actual, self.expected_url_table_result)
Пример #18
0
    def test_render(self):
        """User code that returns its own DataFrame must replace the output."""
        # Replace the output with our own data: a 5x3 table in which every
        # row holds its row number
        code = (
            "columns = ['A','B', 'C']\n"
            "data = np.array([np.arange(5)]*3).T\n"
            "return pd.DataFrame(columns=columns, data=data)"
        )
        self.code_pval.string = code
        self.code_pval.save()

        rendered = execute_wfmodule(self.wf_module)
        expected_text = (
            "   A  B  C\n"
            "0  0  0  0\n"
            "1  1  1  1\n"
            "2  2  2  2\n"
            "3  3  3  3\n"
            "4  4  4  4"
        )
        self.assertEqual(str(rendered), expected_text)
Пример #19
0
def table_result(request, wf_module):
    """
    Render ``wf_module`` and return its rows as a JSON response.

    The optional ``startrow`` and ``endrow`` query parameters bound the rows
    that are serialized. A value that cannot be parsed gives a 400 response
    with a 'bad row number' message.
    """
    query = request.GET
    try:
        startrow = int_or_none(query.get('startrow'))
        endrow = int_or_none(query.get('endrow'))
    except ValueError:
        error_body = {'message': 'bad row number', 'status_code': 400}
        return Response(error_body, status=status.HTTP_400_BAD_REQUEST)

    # Execute and serialize while holding the workflow lock
    with wf_module.workflow.cooperative_lock():
        rendered = execute_wfmodule(wf_module)
        payload = make_render_json(rendered, startrow, endrow)
    return HttpResponse(payload, content_type="application/json")
Пример #20
0
def wfmodule_render(request, pk, format=None):
    """
    Return a module's rendered table as a JSON list of records.

    Responds 404 when no WfModule has primary key ``pk`` and 403 when the
    user is not authorized. Only GET is handled; any other method returns
    None.
    """
    if request.method != 'GET':
        return None

    try:
        wf_module = WfModule.objects.get(pk=pk)
    except WfModule.DoesNotExist:
        return HttpResponseNotFound()

    if not wf_module.user_authorized(request.user):
        return HttpResponseForbidden()

    rendered = execute_wfmodule(wf_module)
    records_json = rendered.to_json(orient='records')
    return HttpResponse(records_json, content_type="application/json")
Пример #21
0
    def test_load_xlsx(self):
        """An XLSX response must load into the same table pandas reads from the file."""
        url = 'http://test.com/the.xlsx'
        self.url_pval.set_value(url)
        self.url_pval.save()

        # Read the fixture through a context manager so the file handle is
        # closed deterministically instead of being left to the GC
        with open(mock_xlsx_path, "rb") as xlsx_file:
            xlsx_bytes = xlsx_file.read()
        xlsx_table = pd.read_excel(mock_xlsx_path)

        with patch('requests.get') as get:
            get.return_value = mock_bytes_response(xlsx_bytes, XLSX_MIME_TYPE)
            self.press_fetch_button()
            result = execute_wfmodule(self.wfmodule)
            self.assertEqual(result, ProcessResult(xlsx_table))
Пример #22
0
    def event(wfm, **kwargs):
        """
        Collect URLs for ``wfm``, scrape them, and commit the resulting table.

        URLs come from the 'urllist' text parameter (one per line) when the
        'urlsource' menu is 'List', or from the 'urlcol' column of the
        previous module's output when it is 'Input column'. With the column
        source and an empty column name, nothing is committed.
        """
        urls = []
        urlsource = wfm.get_param_menu_string('urlsource')

        if urlsource == 'List':
            for raw_line in wfm.get_param_string('urllist').split('\n'):
                candidate = raw_line.strip()
                if not candidate:
                    continue
                # Fix in case user adds an URL without http(s) prefix
                if re.match('^https?://.*', candidate):
                    urls.append(candidate)
                else:
                    urls.append('http://{}'.format(candidate))
        elif urlsource == 'Input column':
            # The list of URLs is a column of the input table
            urlcol = wfm.get_param_column('urlcol')
            if urlcol == '':
                return
            from server.execute import execute_wfmodule
            prev_result = execute_wfmodule(wfm.previous_in_stack())
            prev_table = prev_result.dataframe

            # column parameters are not sanitized here, could be missing
            # this col
            if urlcol in prev_table.columns:
                urls = prev_table[urlcol].tolist()

        if urls:
            table = pd.DataFrame(
                {'url': urls, 'status': ''},
                columns=['url', 'date', 'status', 'html'],
            )
            # scrape_urls is run to completion on this thread's event loop
            loop = get_thread_event_loop()
            loop.run_until_complete(scrape_urls(urls, table))
        else:
            table = pd.DataFrame()

        # One timestamp for every row, with the '+00:00' suffix written as 'Z'
        timestamp = timezone.now().isoformat(timespec='seconds')
        table['date'] = timestamp.replace('+00:00', 'Z')

        # No need to truncate: input is already truncated
        # No need to sanitize: we only added text+date+status
        ModuleImpl.commit_result(wfm, ProcessResult(dataframe=table))
Пример #23
0
    def test_render(self):
        """Count values of a column: no column, both sort modes, and a bad column name."""
        # NOP if no column given: the output is not empty
        set_string(self.col_pval, '')
        rendered = execute_wfmodule(self.wf_module)
        self.assertFalse(rendered.empty)

        # sort by value
        set_string(self.col_pval, 'Amount')
        set_integer(self.sort_pval, 0)
        rendered = execute_wfmodule(self.wf_module)
        expected_by_value = '   Amount  count\n0       5      1\n1      10      2'
        self.assertEqual(str(rendered), expected_by_value)

        # sort by freq
        set_integer(self.sort_pval, 1)
        rendered = execute_wfmodule(self.wf_module)
        expected_by_freq = '   Amount  count\n0      10      2\n1       5      1'
        self.assertEqual(str(rendered), expected_by_freq)

        # bad column name should produce error
        set_string(self.col_pval, 'hilarious')
        execute_wfmodule(self.wf_module)
        self.wf_module.refresh_from_db()
        self.assertEqual(self.wf_module.status, WfModule.ERROR)
Пример #24
0
    def test_first_row_is_header(self):
        """With the first-row option on, row 0 becomes the header and is removed from the data."""
        url = 'http://test.com/tablepage.html'
        self.url_pval.set_value(url)
        self.url_pval.save()
        self.first_row_pval.set_value(True)
        self.first_row_pval.save()

        with mock.patch('pandas.read_html',
                        return_value=[mock_csv_table]) as read_html_mock:
            self.press_fetch_button()
            expected_call = mock.call(url, flavor='html5lib')
            self.assertEqual(read_html_mock.call_args, expected_call)

        result = execute_wfmodule(self.wfmodule)

        # Column names are the stringified values of the original first row
        actual_columns = list(result.dataframe.columns)
        expected_columns = [str(x) for x in mock_csv_table.iloc[0, :]]
        self.assertListEqual(actual_columns, expected_columns)

        # The promoted row no longer counts as data
        self.assertEqual(len(result.dataframe), len(mock_csv_table) - 1)
Пример #25
0
def wfmodule_public_output(request, pk, type, format=None):
    """
    Return a module's rendered table as JSON records or CSV.

    ``type`` selects the format: 'json' or 'csv'. A missing module, a user
    without read access and an unknown ``type`` all respond 404.
    """
    try:
        wf_module = WfModule.objects.get(pk=pk)
    except WfModule.DoesNotExist:
        return HttpResponseNotFound()

    if not wf_module.user_authorized_read(request.user):
        return HttpResponseNotFound()

    rendered = execute_wfmodule(wf_module)

    if type == 'json':
        body = rendered.to_json(orient='records')
        return HttpResponse(body, content_type="application/json")

    if type == 'csv':
        body = rendered.to_csv(index=False)
        return HttpResponse(body, content_type="text/csv")

    return HttpResponseNotFound()
Пример #26
0
def wfmodule_input(request, pk, format=None):
    """
    Return the table that feeds a module, as a JSON list of records.

    The input is the output of the last module ordered before this one in
    the same workflow, or an empty table when there is none. Responds 404
    for an unknown ``pk`` and 403 for an unauthorized user. Only GET is
    handled; any other method returns None.
    """
    if request.method != 'GET':
        return None

    try:
        wf_module = WfModule.objects.get(pk=pk)
    except WfModule.DoesNotExist:
        return HttpResponseNotFound()

    if not wf_module.user_authorized(request.user):
        return HttpResponseForbidden()

    earlier_modules = WfModule.objects.filter(workflow=wf_module.workflow,
                                              order__lt=wf_module.order)
    if earlier_modules:
        table = execute_wfmodule(earlier_modules.last())
    else:
        table = pd.DataFrame()

    records_json = table.to_json(orient='records')
    return HttpResponse(records_json, content_type="application/json")
Пример #27
0
    def test_execute_new_revision(self):
        """Executing after a parameter change caches results at the new delta ids."""
        workflow = create_testdata_workflow(table_csv)
        wf_module2 = load_and_add_module('selectcolumns', workflow=workflow)

        # Add command, modifying revision
        colnames_pval = get_param_by_id_name('colnames', wf_module=wf_module2)
        ChangeParameterCommand.create(colnames_pval, 'A')

        # At this point neither module reports a cached revision
        revisions_before = cached_render_result_revision_list(workflow)
        self.assertEqual(revisions_before, [None, None])

        wf_module1 = workflow.wf_modules.first()
        wf_module1.last_relevant_delta_id = 1
        wf_module1.save()
        wf_module2.last_relevant_delta_id = 2
        wf_module2.save()

        actual = execute_wfmodule(wf_module2)
        expected = ProcessResult(table_dataframe[['A']])
        self.assertEqual(actual, expected)

        revisions_after = cached_render_result_revision_list(workflow)
        self.assertEqual(revisions_after, [1, 2])
Пример #28
0
    def test_load_json(self):
        """A JSON response with nested data must load into the sanitized table."""
        url = 'http://test.com/the.json'
        self.url_pval.set_value(url)
        self.url_pval.save()

        # use a complex example with nested data
        fname = os.path.join(settings.BASE_DIR,
                             'server/tests/test_data/sfpd.json')
        # Read the fixture through a context manager so the file handle is
        # closed deterministically instead of being left to the GC
        with open(fname) as json_file:
            sfpd_json = json_file.read()
        # OrderedDict otherwise cols get sorted
        sfpd_table = pd.DataFrame(
            json.loads(sfpd_json, object_pairs_hook=OrderedDict))
        expected = ProcessResult(sfpd_table)
        expected.sanitize_in_place()

        with patch('requests.get') as get:
            get.return_value = mock_text_response(sfpd_json,
                                                  'application/json')
            self.press_fetch_button()
            result = execute_wfmodule(self.wfmodule)
            self.assertEqual(result, expected)
Пример #29
0
def wfmodule_histogram(request, pk, col, format=None):
    """
    Return value counts for column ``col`` of a module's input, as JSON.

    The counts are computed on the output of the last module ordered before
    this one in the same workflow. Rows are sorted by descending count, then
    by ascending value, and the values are converted to strings. An empty
    table is returned when there is no earlier module.

    Responds 404 for an unknown ``pk``, 403 when the user may not read the
    workflow, and 400 when ``col`` is not in the input. Only GET is handled;
    any other method returns None.
    """
    if request.method != 'GET':
        return None

    try:
        wf_module = WfModule.objects.get(pk=pk)
    except WfModule.DoesNotExist:
        return HttpResponseNotFound()

    if not wf_module.workflow.user_authorized_read(request.user):
        return HttpResponseForbidden()

    INTERNAL_COUNT_COLNAME = '__internal_count_column__'

    earlier_modules = WfModule.objects.filter(workflow=wf_module.workflow, order__lt=wf_module.order)
    if not earlier_modules:
        empty_json = make_render_json(pd.DataFrame())
        return HttpResponse(empty_json, content_type="application/json")

    table = execute_wfmodule(earlier_modules.last())
    if col not in table.columns:
        error_body = {'message': 'Column does not exist in module input', 'status_code': 400}
        return Response(error_body, status=status.HTTP_400_BAD_REQUEST)

    # One row per distinct value, with its number of occurrences
    counts = table.groupby(col).size().reset_index()
    counts.columns = [col, INTERNAL_COUNT_COLNAME]
    counts = counts.sort_values(
        by=[INTERNAL_COUNT_COLNAME, col],
        ascending=[False, True],
    )
    counts[col] = counts[col].astype(str)

    return HttpResponse(make_render_json(counts), content_type="application/json")
Пример #30
0
def wfmodule_output(request, pk, format=None):
    """
    Serve a module's HTML output with its input data and params injected.

    The HTML returned by ``module_dispatch_output`` gets a ``<script>``
    element that defines a global ``workbench`` object with 'input' and
    'params' keys. The script is inserted right after the opening ``<head>``
    tag, or at the very start of the document when there is no head tag.

    Responds 404 for an unknown ``pk`` and 403 when the user may not read the
    workflow. Only GET is handled; any other method falls through and
    returns None.
    """
    if request.method == 'GET':
        try:
            wf_module = WfModule.objects.get(pk=pk)
        except WfModule.DoesNotExist:
            return HttpResponseNotFound()

        if not wf_module.workflow.user_authorized_read(request.user):
            return HttpResponseForbidden()

        table = execute_wfmodule(wf_module)

        html, input_data, params = module_dispatch_output(wf_module, table, request=request)

        input_data_json = make_render_json(input_data)

        init_data = json.dumps({
            'input': json.loads(input_data_json),
            'params': params
        })
        # Escape "</" so that data containing "</script>" cannot close the
        # script element early. "<\/" is an equivalent escape inside JSON/JS
        # strings, so the values seen by the page are unchanged.
        init_data = init_data.replace('</', '<\\/')

        js="""
        <script>
        var workbench = %s
        </script>"""  % init_data

        # Match an opening <head> tag in any letter case, with or without
        # attributes; the whitespace-or-'>' requirement excludes <header>.
        head_tag_pattern = re.compile(r'<\s*head(\s[^>]*)?>', re.IGNORECASE)
        result = head_tag_pattern.search(html)

        # Without a head tag, put the script at the start instead of crashing
        insert_at = result.end() if result else 0

        modified_html = '%s %s %s' % (
            html[:insert_at],
            js,
            html[insert_at:]
        )

        return HttpResponse(content=modified_html)