def test_csv_export_with_df_size_limit(self):
    """
    To work around the pandas limitation of 30k rows on CSV export, we
    specify a maximum number of records per dataframe on export - let's
    test it.
    """
    self._publish_single_level_repeat_form()
    # submit 7 instances: 4 + 1 + 2
    for i in range(4):
        self._submit_fixture_instance("new_repeats", "01")
    self._submit_fixture_instance("new_repeats", "02")
    for i in range(2):
        self._submit_fixture_instance("new_repeats", "01")
    csv_df_builder = CSVDataFrameBuilder(self.user.username,
                                         self.xform.id_string)
    record_count = csv_df_builder._query_mongo(count=True)
    self.assertEqual(record_count, 7)
    temp_file = NamedTemporaryFile(suffix=".csv", delete=False)
    # export with at most 3 records per dataframe
    csv_df_builder.export_to(temp_file.name, data_frame_max_size=3)
    csv_file = open(temp_file.name)
    csv_reader = csv.reader(csv_file)
    header = csv_reader.next()
    self.assertEqual(
        len(header), 17 + len(AbstractDataFrameBuilder.ADDITIONAL_COLUMNS))
    rows = []
    for row in csv_reader:
        rows.append(row)
    # chunking must not change the number or content of the rows
    self.assertEqual(len(rows), 7)
    self.assertEqual(rows[4][5], NA_REP)
    # close and delete file
    csv_file.close()
    os.unlink(temp_file.name)
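# Illustrative sketch only - not the onadata implementation - of how a
# data_frame_max_size limit can be honored: export a large record set in
# fixed-size chunks, writing the header just once so the chunks concatenate
# into a single well-formed CSV. The helper name write_csv_in_chunks is an
# assumption for illustration, and it assumes every record shares the same
# keys (the real builder fixes the column set up front).
import pandas


def write_csv_in_chunks(records, path, data_frame_max_size=30000):
    with open(path, 'w') as f:
        for start in range(0, len(records), data_frame_max_size):
            chunk = records[start:start + data_frame_max_size]
            df = pandas.DataFrame(chunk)
            # only the first chunk carries the header row
            df.to_csv(f, header=(start == 0), index=False)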
def to_flat_csv_export(
        self, path, data, username, id_string, filter_query):
    # TODO resolve circular import
    from onadata.apps.viewer.pandas_mongo_bridge import \
        CSVDataFrameBuilder

    csv_builder = CSVDataFrameBuilder(
        username, id_string, filter_query, self.GROUP_DELIMITER,
        self.SPLIT_SELECT_MULTIPLES, self.BINARY_SELECT_MULTIPLES)
    csv_builder.export_to(path)
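# Minimal sketch of the class context to_flat_csv_export assumes: the method
# reads its CSV options from class attributes on the surrounding export
# builder. The class name and attribute values below are illustrative
# assumptions, not necessarily onadata's actual defaults.
class FlatCsvExportBuilder(object):
    GROUP_DELIMITER = '/'            # separator for nested group names
    SPLIT_SELECT_MULTIPLES = True    # one column per choice
    BINARY_SELECT_MULTIPLES = False  # True/False rather than 1/0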
def test_csv_dataframe_export_to(self):
    self._publish_nested_repeats_form()
    self._submit_fixture_instance(
        "nested_repeats", "01", submission_time=self._submission_time)
    self._submit_fixture_instance(
        "nested_repeats", "02", submission_time=self._submission_time)
    csv_df_builder = CSVDataFrameBuilder(self.user.username,
                                         self.xform.id_string)
    temp_file = NamedTemporaryFile(suffix=".csv", delete=False)
    csv_df_builder.export_to(temp_file.name)
    csv_fixture_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "fixtures",
        "nested_repeats", "nested_repeats.csv")
    temp_file.close()
    fixture, output = '', ''
    with open(csv_fixture_path) as f:
        fixture = f.read()
    with open(temp_file.name) as f:
        output = f.read()
    os.unlink(temp_file.name)
    self.assertEqual(fixture, output)
def test_split_select_multiples_within_repeats(self):
    self.maxDiff = None
    record = {
        'name': 'Tom',
        'age': 23,
        'browser_use': [
            {
                'browser_use/year': '2010',
                'browser_use/browsers': 'firefox safari'
            },
            {
                'browser_use/year': '2011',
                'browser_use/browsers': 'firefox chrome'
            }
        ]
    }
    expected_result = {
        'name': 'Tom',
        'age': 23,
        'browser_use': [
            {
                'browser_use/year': '2010',
                'browser_use/browsers/firefox': True,
                'browser_use/browsers/safari': True,
                'browser_use/browsers/ie': False,
                'browser_use/browsers/chrome': False
            },
            {
                'browser_use/year': '2011',
                'browser_use/browsers/firefox': True,
                'browser_use/browsers/safari': False,
                'browser_use/browsers/ie': False,
                'browser_use/browsers/chrome': True
            }
        ]
    }
    select_multiples = {
        'browser_use/browsers': [
            'browser_use/browsers/firefox',
            'browser_use/browsers/safari',
            'browser_use/browsers/ie',
            'browser_use/browsers/chrome']}
    result = CSVDataFrameBuilder._split_select_multiples(
        record, select_multiples)
    self.assertEqual(expected_result, result)
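# Illustrative sketch - an assumption, not onadata's actual implementation -
# of the transformation the test above expects: a select-multiple value is a
# space-separated string of choice names, and splitting it replaces that
# column with one boolean column per known choice, recursing into repeats,
# which are stored as lists of dicts.
def split_select_multiples_sketch(record, select_multiples):
    for key, choices in select_multiples.items():
        if key in record:
            selected = record.pop(key).split()
            for choice in choices:
                # e.g. 'browser_use/browsers/firefox' -> True/False
                record[choice] = choice.split('/')[-1] in selected
    # recurse into repeat groups
    for value in record.values():
        if isinstance(value, list):
            for child in value:
                if isinstance(child, dict):
                    split_select_multiples_sketch(child, select_multiples)
    return record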
def test_split_select_multiples(self):
    self._publish_nested_repeats_form()
    dd = self.xform.data_dictionary()
    self._submit_fixture_instance("nested_repeats", "01")
    csv_df_builder = CSVDataFrameBuilder(self.user.username,
                                         self.xform.id_string)
    cursor = csv_df_builder._query_mongo()
    record = cursor[0]
    select_multiples = CSVDataFrameBuilder._collect_select_multiples(dd)
    result = CSVDataFrameBuilder._split_select_multiples(
        record, select_multiples)
    expected_result = {
        u'web_browsers/ie': True,
        u'web_browsers/safari': True,
        u'web_browsers/firefox': False,
        u'web_browsers/chrome': False
    }
    # build a new dictionary only composed of the keys we want to use in
    # the comparison
    result = dict([(key, result[key]) for key in result.keys()
                   if key in expected_result.keys()])
    self.assertEqual(expected_result, result)
    csv_df_builder = CSVDataFrameBuilder(self.user.username,
                                         self.xform.id_string,
                                         binary_select_multiples=True)
    result = csv_df_builder._split_select_multiples(
        record, select_multiples)
    expected_result = {
        u'web_browsers/ie': 1,
        u'web_browsers/safari': 1,
        u'web_browsers/firefox': 0,
        u'web_browsers/chrome': 0
    }
    # build a new dictionary only composed of the keys we want to use in
    # the comparison
    result = dict([(key, result[key]) for key in result.keys()
                   if key in expected_result.keys()])
    self.assertEqual(expected_result, result)
def _csv_data_for_dataframe(self):
    csv_df_builder = CSVDataFrameBuilder(self.user.username,
                                         self.xform.id_string)
    cursor = csv_df_builder._query_mongo()
    return csv_df_builder._format_for_dataframe(cursor)
def get_csv_data(xform, force_last=False):
    def getbuff():
        return StringIO.StringIO()

    def get_headers_from(csv_data):
        csv_data.seek(0)
        header_row = csv_data.readline()
        csv_data.read()
        return header_row.split(',')

    def get_csv_data_manual(xform, only_last=False, with_header=True,
                            headers_to_use=None):
        # TODO: find out a better way to handle this.
        # When the form has only one submission, the CSVDataFrameBuilder
        # output is empty. We still want to create the bamboo dataset with
        # row 1, so we extract it and CSV it manually.
        instances = Instance.objects.filter(
            xform=xform).order_by('-date_modified')
        if instances.count() == 0:
            raise NoRecordsFoundError
        else:
            # we should only do it for count == 1 but eh.
            csv_buf = getbuff()
            if only_last:
                instances = instances[0:1]
            rows = [instance.get_full_dict() for instance in instances]
            if headers_to_use is None:
                headers_to_use = [key for key in rows[0].keys()
                                  if not key.startswith('_')]
            w = unicodecsv.DictWriter(csv_buf, fieldnames=headers_to_use,
                                      extrasaction='ignore',
                                      lineterminator='\n',
                                      encoding='utf-8')
            if with_header:
                w.writeheader()
            w.writerows(rows)
            csv_buf.flush()
            if not csv_buf.len:
                raise NoRecordsFoundError
            return csv_buf.getvalue()

    # set up an IO stream
    buff = getbuff()
    # prepare/generate a standard CSV export.
    # note that it omits the current submission (if called from rest)
    csv_dataframe_builder = CSVDataFrameBuilder(xform.user.username,
                                                xform.id_string)
    try:
        csv_dataframe_builder.export_to(buff)
        if force_last:
            # requested to add the last submission to the buffer
            buff.write(get_csv_data_manual(
                xform, only_last=True, with_header=False,
                headers_to_use=get_headers_from(buff)))
    except NoRecordsFoundError:
        # verify that we don't have a single submission before giving up
        get_csv_data_manual(xform, with_header=True)

    if buff.len:
        # rewrite the CSV header so that meta fields (starting with _ or
        # meta) are prefixed, to ensure that the dataset will be joinable
        # to another formhub dataset
        prefix = (u'%(id_string)s_%(id)s' % {'id_string': xform.id_string,
                                             'id': xform.id})
        new_buff = getbuff()
        buff.seek(0)
        reader = unicodecsv.reader(buff, encoding='utf-8')
        writer = unicodecsv.writer(new_buff, encoding='utf-8')
        is_header = True
        for row in reader:
            if is_header:
                is_header = False
                for idx, col in enumerate(row):
                    if col.startswith('_') or col.startswith('meta_') \
                            or col.startswith('meta/'):
                        row[idx] = (u'%(prefix)s%(col)s' %
                                    {'prefix': prefix, 'col': col})
            writer.writerow(row)
        return new_buff.getvalue()
    else:
        raise NoRecordsFoundError
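# Worked illustration (with hypothetical values) of the header-prefixing rule
# in get_csv_data above: with id_string 'household_survey' and xform.id 42,
# the prefix is 'household_survey_42', and it is prepended verbatim to any
# column starting with '_', 'meta_' or 'meta/'. The helper name
# prefix_meta_columns is an assumption for illustration.
def prefix_meta_columns(header, prefix):
    return [prefix + col if col.startswith(('_', 'meta_', 'meta/')) else col
            for col in header]

# prefix_meta_columns(['age', '_uuid', 'meta/instanceID'],
#                     'household_survey_42')
# -> ['age', 'household_survey_42_uuid',
#     'household_survey_42meta/instanceID']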