Example No. 1
    def test_detect_dialect_using_json(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        encoding = 'utf-8'
        self.files_to_delete.append(filename)

        # Using JSON will force the sniffer not to include ':' and '}' in the
        # possible delimiters (see the note after this example)
        table = rows.Table(fields=OrderedDict([
            ('jsoncolumn1', rows.fields.JSONField),
            ('jsoncolumn2', rows.fields.JSONField),
            ]))
        table.append({
            'jsoncolumn1': '{"a": 42}',
            'jsoncolumn2': '{"b": 43}',
            })
        table.append({
            'jsoncolumn1': '{"c": 44}',
            'jsoncolumn2': '{"d": 45}',
            })
        rows.export_to_csv(table, filename, encoding=encoding)

        table = rows.import_from_csv(filename, encoding=encoding)

        self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
        self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
        self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
        self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
        self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
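A note on the example above: if the sniffer still guesses the dialect wrong for some input, detection can be bypassed by passing a dialect explicitly. A minimal sketch, assuming `import_from_csv` accepts a `dialect` argument the same way `export_to_csv` does in the dialect test further down this page (the filename is a placeholder):

import csv

import rows

# Assumption: passing `dialect` skips the sniffing step exercised by the test above
table = rows.import_from_csv('data.csv', encoding='utf-8', dialect=csv.excel)
print(table.field_names)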
Example No. 2
    def parse_state_file(self, response):
        state = response.meta["state"]

        self.errors = []
        try:
            self.parse_boletim(state, response.body)
        except Exception as exp:
            self.errors.append(
                ("boletim", state, f"{exp.__class__.__name__}: {exp}"))
        try:
            self.parse_caso(state, response.body)
        except Exception as exp:
            self.errors.append(
                ("caso", state, f"{exp.__class__.__name__}: {exp}"))
        if self.errors:
            error_counter = Counter(error[0] for error in self.errors)
            error_counter_str = ", ".join(
                f"{error_type}: {count}"
                for error_type, count in error_counter.items())
            self.logger.error(
                f"{len(self.errors)} errors found when parsing {state} ({error_counter_str})"
            )
            error_header = ("sheet", "state", "message")
            errors = rows.import_from_dicts(
                [dict(zip(error_header, row)) for row in self.errors])
            rows.export_to_csv(errors, f"errors-{state}.csv")
            exit(255)
Example No. 3
    def test_quotes(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([
                    ('field_1', rows.fields.TextField),
                    ('field_2', rows.fields.TextField),
                    ('field_3', rows.fields.TextField),
                    ('field_4', rows.fields.TextField), ]))
        table.append({
            'field_1': '"quotes"',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
            })
        # we need this extra row since `"quotes"` on `field_1` alone could be
        # detected as either `JSONField` or `TextField` (see the note after
        # this example)
        table.append({
            'field_1': 'noquotes',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
            })
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
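If adding a disambiguating row is not an option, the detected type can also be forced on import, as the `convert()` example further down this page does with `force_types`. A minimal sketch of that alternative (the filename is a placeholder):

import rows

table2 = rows.import_from_csv(
    'quotes.csv',  # placeholder filename
    force_types={'field_1': rows.fields.TextField},  # skip type detection for this column
)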
Example No. 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("state", choices=spiders.keys())
    parser.add_argument("--year", type=int)
    parser.add_argument("--action", type=int)
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--headless", action="store_true")
    args = parser.parse_args()
    if args.year is None and args.action is None:
        actions = get_actions_for_state(args.state)
    elif args.action is None:
        actions = [
            action
            for action in get_actions_for_state(args.state)
            if action.year == args.year
        ]
    else:
        actions = [
            Action(year=args.year, code=args.action, state=args.state, name="Unknown")
        ]

    spider = spiders[args.state](headless=args.headless)
    for action in actions:
        if not args.quiet:
            print(
                f"Downloading budget execution for {action.state} ({action.code} @ {action.year})"
            )
        table = spider.execute(action.year, action.code)

        output_filename = f"{action.state}-{action.year}-{action.code}.csv"
        rows.export_to_csv(table, output_filename)
        if not args.quiet:
            print(f"  done (saved to {output_filename})")
    spider.close()
    def test_detect_dialect_using_json(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        encoding = 'utf-8'
        self.files_to_delete.append(filename)

        # Using JSON will force the sniffer not to include ':' and '}' in the
        # possible delimiters
        table = rows.Table(fields=OrderedDict([
            ('jsoncolumn1', rows.fields.JSONField),
            ('jsoncolumn2', rows.fields.JSONField),
        ]))
        table.append({
            'jsoncolumn1': '{"a": 42}',
            'jsoncolumn2': '{"b": 43}',
        })
        table.append({
            'jsoncolumn1': '{"c": 44}',
            'jsoncolumn2': '{"d": 45}',
        })
        rows.export_to_csv(table, filename, encoding=encoding)

        table = rows.import_from_csv(filename, encoding=encoding)

        self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
        self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
        self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
        self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
        self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
Example No. 6
    def parse_state_file(self, response):
        state = response.meta["state"]

        self.errors = []
        try:
            self.parse_boletim(state, response.body)
        except Exception as exp:
            self.errors.append(
                ("boletim", state, f"{exp.__class__.__name__}: {exp}"))
        try:
            self.parse_caso(state, response.body)
        except Exception as exp:
            self.errors.append(
                ("caso", state, f"{exp.__class__.__name__}: {exp}"))
        if self.errors:
            error_counter = Counter(error[0] for error in self.errors)
            error_counter_str = ", ".join(
                f"{error_type}: {count}"
                for error_type, count in error_counter.items())
            self.logger.error(
                f"{len(self.errors)} errors found when parsing {state} ({error_counter_str})"
            )
            error_header = ("sheet", "state", "message")
            errors = rows.import_from_dicts(
                [dict(zip(error_header, row)) for row in self.errors])
            filename = ERROR_PATH / f"errors-{state}.csv"
            if not filename.parent.exists():
                filename.parent.mkdir(parents=True)
            rows.export_to_csv(errors, filename)

            # Force crawler to stop
            os.kill(os.getpid(), SIGINT)
            os.kill(os.getpid(), SIGINT)
            raise CloseSpider(f"Error found on {state} (see {filename}).")
    def test_quotes(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([
            ('field_1', rows.fields.TextField),
            ('field_2', rows.fields.TextField),
            ('field_3', rows.fields.TextField),
            ('field_4', rows.fields.TextField),
        ]))
        table.append({
            'field_1': '"quotes"',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
        })
        # we need this extra row since `"quotes"` on `field_1` alone could be
        # detected as either `JSONField` or `TextField`
        table.append({
            'field_1': 'noquotes',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
        })
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
Example No. 8
def create_download_script(filename=Path('data/download.sh'),
                           output_path=Path('data/output'),
                           download_path=Path('data/download')):
    if not filename.parent.exists():
        filename.parent.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    links = discover_links()
    today = datetime.datetime.now()
    date = f'{today.year}-{today.month:02d}-{today.day:02d}'
    rows.export_to_csv(links, output_path / f'links_{date}.csv')

    with open(filename, mode='w', encoding='utf8') as fobj:
        fobj.write('#!/bin/sh\n')
        fobj.write(
            f'# Arquivo gerado em {today.year}-{today.month}-{today.day}\n')
        fobj.write(
            '# Visite o site da Receita Federal para verificar se existem atualizações.\n\n'
        )
        fobj.write('mkdir -p {}\n\n'.format(str(download_path)))
        for row in links:
            path = download_path / (row.uf + '.txt')
            fobj.write(f'wget -O "{path}" "{row.url}"\n')

    meta = os.stat(filename)
    os.chmod(filename, meta.st_mode | stat.S_IEXEC)
def download_and_save(names, filename):
    result = []
    for name in names:
        print(name)
        result.append(classify_by_sex(name))
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, filename)
Example No. 10
    def parse_state_file(self, response):
        meta = response.meta
        state = meta["state"]
        caso_filename = meta["caso_filename"]
        if response.status >= 400:
            self.errors[state].append(
                ("connection", state, f"HTTP status code: {response.status}"))
        else:
            response_data = json.load(io.BytesIO(response.body))
            try:
                self.parse_boletim(state, response_data["reports"])
            except Exception as exp:
                self.errors[state].append(
                    ("boletim", state, f"{exp.__class__.__name__}: {exp}"))
            try:
                self.parse_caso(state, caso_filename, response_data["cases"])
            except Exception as exp:
                self.errors[state].append(
                    ("caso", state, f"{exp.__class__.__name__}: {exp}"))

        if self.errors[state]:
            error_counter = Counter(error[0] for error in self.errors[state])
            error_counter_str = ", ".join(
                f"{error_type}: {count}"
                for error_type, count in error_counter.items())
            self.logger.error(
                f"{len(self.errors[state])} errors found when parsing {state} ({error_counter_str})"
            )
            error_header = ("sheet", "state", "message")
            errors = rows.import_from_dicts(
                [dict(zip(error_header, row)) for row in self.errors[state]])
            filename = ERROR_PATH / f"errors-{state}.csv"
            if not filename.parent.exists():
                filename.parent.mkdir(parents=True)
            rows.export_to_csv(errors, filename)
Example No. 11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('html_entrada')
    parser.add_argument('csv_saida')
    args = parser.parse_args()

    table = sum_iof_into_entries(html_to_table(args.html_entrada))
    rows.export_to_csv(table, args.csv_saida)
Example No. 12
    def test_export_to_csv_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        rows.export_to_csv(utils.table, temp.name)

        table = rows.import_from_csv(temp.name)
        self.assert_table_equal(table, utils.table)
Example No. 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('html_entrada')
    parser.add_argument('csv_saida')
    args = parser.parse_args()

    table = sum_iof_into_entries(html_to_table(args.html_entrada))
    rows.export_to_csv(table, args.csv_saida)
Example No. 14
def create_final_headers(header_type, order_columns, final_filename):
    final_headers = {}
    filenames = sorted(
        [
            (REGEXP_HEADER_YEAR.findall(filename)[0], filename)
            for filename in glob(str(settings.HEADERS_PATH / f"{header_type}-*.csv"))
            if REGEXP_HEADER_YEAR.findall(filename)
        ]
    )
    # TODO: check whether the schema matches the final header; if there are
    # diffs, warn the user.
    for index, (header_year, filename) in enumerate(filenames):
        header = read_header(filename)
        for row in header:
            if not row.nome_final:
                continue
            if row.nome_final not in final_headers:
                row_data = row._asdict()
                if index > 0:
                    row_data["introduced_on"] = header_year
                row_data["original_names"] = [(header_year, row_data.pop("nome_tse"))]
                final_headers[row.nome_final] = row_data
            else:
                original_name = (header_year, row.nome_tse)
                original_names = final_headers[row.nome_final]["original_names"]
                should_add = True
                for original in original_names:
                    if original[1] == original_name[1]:
                        should_add = False
                        break
                if should_add:
                    original_names.append(original_name)

    table = rows.Table(
        fields=OrderedDict(
            [
                ("nome_final", rows.fields.TextField),
                ("descricao", rows.fields.TextField),
            ]
        )
    )

    header_list = sorted(
        final_headers.values(), key=lambda row: order_columns(row["nome_final"])
    )
    for row in header_list:
        row_data = {"descricao": row["descricao"] or "", "nome_final": row["nome_final"]}
        introduced_on = row.get("introduced_on", None)
        original_names = ", ".join(
            f"{item[1]} ({item[0]})" for item in row.get("original_names")
        )
        row_data["descricao"] += f". Aparece no TSE como: {original_names}"
        if introduced_on:
            row_data["descricao"] += f". Coluna adicionada em {introduced_on}"
        if row_data["descricao"][-1] != ".":
            row_data["descricao"] += "."
        table.append(row_data)
    rows.export_to_csv(table, final_filename)
    def test_export_callback(self):
        table = rows.import_from_dicts([{
            'id': number
        } for number in range(10)])
        myfunc = mock.Mock()
        rows.export_to_csv(table, callback=myfunc, batch_size=3)
        self.assertEqual(myfunc.call_count, 4)
        self.assertEqual([x[0][0] for x in myfunc.call_args_list],
                         [3, 6, 9, 10])
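The test above shows that `export_to_csv` calls `callback` after every `batch_size` rows, passing the cumulative number of rows written (3, 6, 9 and finally 10). A minimal progress-reporting sketch based on that behavior (table contents and filename are placeholders):

import rows

table = rows.import_from_dicts([{'id': number} for number in range(10)])

def report_progress(done):
    # `done` is the cumulative number of rows exported so far
    print(f'{done} rows exported')

rows.export_to_csv(table, 'output.csv', callback=report_progress, batch_size=3)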
Example No. 16
	def write_csv(self):

		filename = OUTPUT_PATH / Path("caso-sc.csv")

		# Adding the state-level row for SC
		data = {}
		data["date"] = date.today()
		data["state"] = "SC"
		data["city"] = ""
		data["place_type"] = "state"
		data["notified"] = sum([self.data_cities[i]["notified"] for i in range(len(self.data_cities))])
		data["confirmed"] = sum([self.data_cities[i]["confirmed"] for i in range(len(self.data_cities))])
		data["discarded"] = sum([self.data_cities[i]["discarded"] for i in range(len(self.data_cities))])
		data["suspect"] = sum([self.data_cities[i]["suspect"] for i in range(len(self.data_cities))])
		data["deaths"] = sum([self.data_cities[i]["deaths"] for i in range(len(self.data_cities))])

		data["city_ibge_code"] = ""
		data["estimated_population_2019"] = sum([self.data_cities[i]["estimated_population_2019"] for i in range(len(self.data_cities))])
		data["confirmed_per_100k_inhabitants"] = sum([self.data_cities[i]["confirmed_per_100k_inhabitants"] for i in range(len(self.data_cities))])
		data["death_rate"] = sum([self.data_cities[i]["death_rate"] for i in range(len(self.data_cities))]) / len(self.data_cities)
		data["notes"] = ""
		data["source_url"] = ""
		self.data_cities.append(data)

		# Adding the cities that have not been parsed yet
		for each_city in self.cities_sc_ibge:
			if each_city.municipio not in self.cidades_url.keys():
				data = {}
				data["date"] = ""
				data["state"] = "SC"
				data["city"] = each_city.municipio
				data["place_type"] = "city"
				data["notified"] = ""
				data["confirmed"] = ""
				data["discarded"] = ""
				data["suspect"] = ""
				data["deaths"] = ""
				data["notes"] = ""
				data["city_ibge_code"] = ""
				data["estimated_population_2019"] = ""
				data["confirmed_per_100k_inhabitants"] = ""
				data["death_rate"] = ""
				data["source_url"] = ""
				self.data_cities.append(data)


		# keyorder = ['date', 'state', 'city', 'place_type', 'notified', 'confirmed', 'discarded', 'suspect', 'deaths', 'notes', 'city_ibge_code', 'estimated_population_2019', 'confirmed_per_100k_inhabitants', 'death_rate', 'source_url']
		#
		# data_cities_ordered = []
		# for d in self.data_cities:
		# 	data_cities_ordered.append(sorted(d.items(), key=lambda i:keyorder.index(i[0])))

		rows_data = rows.import_from_dicts(self.data_cities)

		rows_data.order_by("city")

		rows.export_to_csv(rows_data, filename)
Example No. 17
    def test_import_field_limit(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.import_from_dicts([{"f1": "a" * 132000}])
        rows.export_to_csv(table, filename)

        # The following line must not raise the exception:
        # `_csv.Error: field larger than field limit (131072)`
        new = rows.import_from_csv(filename)
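For comparison, the standard library's `csv` reader raises exactly the error quoted in the comment above unless the limit is raised by hand (the test asserts that `rows` handles this for you). A rough sketch of the manual stdlib workaround (the filename is a placeholder):

import csv

csv.field_size_limit(1024 * 1024)  # allow fields larger than the default 131072 bytes
with open('big-fields.csv', newline='') as fobj:
    for record in csv.reader(fobj):
        pass  # long fields no longer raise `_csv.Error: field larger than field limit`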
Example No. 18
    def test_issue_168(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
        table.append({"jsoncolumn": '{"python": 42}'})
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
Example No. 19
    def export_prob_predictions_to_csv(self, filename, ids, predictions):
        new_rows = []
        
        # classifier = {
        #     'Return_to_owner': 0,
        #     'Euthanasia': 1,
        #     'Adoption': 2,
        #     'Transfer': 3,
        #     'Died': 4
        # }
        #print(predictions)
        #count = [0, 0, 0, 0]
        #m = []
        for i, prediction in enumerate(predictions):
            #ID	Adoption	Died	Euthanasia	Return_to_owner	Transfer
            # print(type(prediction))
            # print(prediction == '3')
            # print(int(prediction == '3'))
            
            new_row = OrderedDict({})
            # print(prediction[0])
            # print(type(prediction[0]))

            # print numpy.argmax(prediction)
            # m.append(numpy.argmax(prediction))
            # if numpy.argmax(prediction) == 0:
            #     count[0]+=1
            # if numpy.argmax(prediction) == 1:
            #     count[1]+=1
            # if numpy.argmax(prediction) == 2:
            #     count[2]+=1
            # if numpy.argmax(prediction) == 3:
            #     count[3]+=1
            
            new_row['ID'] = ids[i]
            new_row['Adoption'] = prediction[2]
            new_row['Died'] = prediction[4]
            new_row['Euthanasia'] = prediction[1]
            new_row['Return_to_owner'] = prediction[0]
            new_row['Transfer'] = prediction[3]
            
            new_rows.append(new_row)
        #print(count)
        #print(set(m))
        new_rows.sort(key=lambda e: e['ID'])
        #print(new_rows)
        
        new_fields = [(key, rows.fields.UnicodeField) for key in new_rows[0].keys()]
        table_to = rows.Table(fields=OrderedDict(new_fields))
        for row in new_rows:
            table_to.append(row)
            
        rows.export_to_csv(table_to, filename)
Example No. 20
    def test_export_to_csv_uses_serialize(self, mocked_serialize):
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        kwargs = {"test": 123, "parameter": 3.14}
        mocked_serialize.return_value = iter([utils.table.fields.keys()])

        rows.export_to_csv(utils.table, temp.name, encoding="utf-8", **kwargs)
        self.assertTrue(mocked_serialize.called)
        self.assertEqual(mocked_serialize.call_count, 1)

        call = mocked_serialize.call_args
        self.assertEqual(call[0], (utils.table,))
        self.assertEqual(call[1], kwargs)
    def test_export_to_csv_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        rows.export_to_csv(utils.table, temp.name)

        table = rows.import_from_csv(temp.name)
        self.assert_table_equal(table, utils.table)

        temp.file.seek(0)
        result = temp.file.read()
        export_in_memory = rows.export_to_csv(utils.table, None)
        self.assertEqual(result, export_in_memory)
Example No. 22
    def test_export_to_csv_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        rows.export_to_csv(utils.table, temp.name)

        table = rows.import_from_csv(temp.name)
        self.assert_table_equal(table, utils.table)

        temp.file.seek(0)
        result = temp.file.read()
        export_in_memory = rows.export_to_csv(utils.table, None)
        self.assertEqual(result, export_in_memory)
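The last assertion above relies on `export_to_csv` returning the serialized CSV as bytes when no filename or file object is given. A minimal in-memory sketch of that behavior:

import rows

table = rows.import_from_dicts([{'name': 'Alice'}, {'name': 'Bob'}])
csv_bytes = rows.export_to_csv(table)  # no destination: the CSV contents are returned as bytes
print(csv_bytes.decode('utf-8'))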
Example No. 23
    def test_export_to_csv_uses_serialize(self, mocked_serialize):
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        kwargs = {'test': 123, 'parameter': 3.14, }
        mocked_serialize.return_value = iter([utils.table.fields.keys()])

        rows.export_to_csv(utils.table, temp.name, encoding='utf-8', **kwargs)
        self.assertTrue(mocked_serialize.called)
        self.assertEqual(mocked_serialize.call_count, 1)

        call = mocked_serialize.call_args
        self.assertEqual(call[0], (utils.table, ))
        self.assertEqual(call[1], kwargs)
Example No. 24
def main():
    now = datetime.datetime.now()
    today = datetime.date(now.year, now.month, now.day)
    download_path = pathlib.Path('download')
    output_path = pathlib.Path('output')
    if not download_path.exists():
        download_path.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    # Get spreadsheet links
    links = get_links(date=today)
    rows.export_to_csv(links, output_path / f'links-{today}.csv')

    # Download all the links
    result = []
    for link in links:
        print(link.name)

        filename = download_path / urlparse(link.url).path.split('/')[-1]

        # Download file
        print(f'  Downloading ({link.url})...', end='', flush=True)
        if filename.exists():
            print(f' already downloaded.')
        else:
            try:
                download(link.url, filename)
            except RuntimeError as exception:
                print(f' {exception.args[0]}')
                continue
            else:
                print(' done.')

        # Extract data
        print(f'  Extracting ({filename})...', end='', flush=True)
        try:
            data = extract(filename, link)
        except Exception as exp:
            import traceback
            print(f' ERROR! {traceback.format_exc().splitlines()[-1]}')
        else:
            print(f' done (rows extracted: {len(data)}).')
            result.extend(data)

    # Extract everything to a new CSV
    output = output_path / f'salarios-magistrados-{today}.csv'
    print(f'Extracting result to {output}...')
    export_csv(result, output)
def pdf_to_csv(input_filename, output_filename):
    total_pages = rows.plugins.pdf.number_of_pages(input_filename)
    pdf = rows.plugins.pdf.PyMuPDFBackend(input_filename)
    result = []
    for page_number in range(1, total_pages + 1):
        page = list(next(pdf.objects(page_numbers=(page_number, ))))
        data = list(rows.plugins.utils.ipartition(page, 4))
        header = [obj.text for obj in data[0]]
        for row in data[1:]:
            row = dict(zip(header, [obj.text for obj in row]))
            row["codigo_ibge"] = row.pop("IBGE")
            row["perfil"] = row.pop("Perfil Município")
            result.append(row)
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, output_filename)
Example No. 26
    def test_import_from_xpath_filename(self):
        table = rows.import_from_xpath(self.filename,
                                       encoding=self.encoding,
                                       **self.kwargs)

        expected_meta = {'imported_from': 'xpath', 'filename': self.filename,}
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
    def __call__(self):
        view = getMultiAdapter((self.context, self.request), name='view')
        table = view.table()
        filename = "%s.csv" % view.filename_prefix()
        data = rows.export_to_csv(table)
        self.request.response.setHeader('Content-Type', '"%s"' % EXTENSIONS_TYPES.get('csv'))
        self.request.response.setHeader('Content-Disposition', 'attachment; filename="%s"' % filename)
        return data
Example No. 28
def convert(state, input_filename, output_filename):
    table = rows.import_from_csv(
        input_filename,
        force_types={
            "confirmed": rows.fields.IntegerField,
            "deaths": rows.fields.IntegerField,
        },
    )
    state_cities = ["TOTAL NO ESTADO", "Importados/Indefinidos"] + sorted(
        row.municipio for row in cities if row.uf == state
    )
    confirmed, deaths, dates = {}, {}, []
    for row in table:
        row_confirmed = row.confirmed or 0
        row_date = row.date
        row_deaths = row.deaths or 0
        row_name = row.city if row.place_type == "city" else "TOTAL NO ESTADO"

        if row_name not in state_cities:
            print(f"ERRO: município {repr(row_name)} não encontrado.")
            continue
        if row_confirmed == 0 and row_deaths == 0:
            # No data for this city on this day
            continue
        if row_date not in confirmed:
            confirmed[row_date] = {}
        if row_date not in deaths:
            deaths[row_date] = {}
        if row_name in confirmed[row_date] or row_name in deaths[row_date]:
            print(f"ERRO: conflito em {repr(row_name)} para {row_date}.")
            continue

        confirmed[row_date][row_name] = row_confirmed
        deaths[row_date][row_name] = row_deaths

    result = []
    dates = sorted(confirmed.keys(), reverse=True)
    for city in state_cities:
        row = {"municipio": city}
        for date in dates:
            date_str = f"{date.day:02d}_{date.month:02d}"
            row[f"confirmados_{date_str}"] = confirmed[date].get(city, None)
            row[f"mortes_{date_str}"] = deaths[date].get(city, None)
        result.append(row)
    rows.export_to_csv(rows.import_from_dicts(result), output_filename)
Example No. 29
def main():
    now = datetime.datetime.now()
    today = datetime.date(now.year, now.month, now.day)
    download_path = pathlib.Path('download')
    output_path = pathlib.Path('output')
    if not download_path.exists():
        download_path.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    # Get spreadsheet links
    links = get_links(date=today)
    rows.export_to_csv(links, output_path / f'links-{today}.csv')

    # Download all the links
    filenames = []
    for link in links:
        save_path = download_path / urlparse(link.url).path.split('/')[-1]
        filenames.append(save_path)
        if not save_path.exists():
            print(f'Downloading {link.url}...', end='', flush=True)
            download(link.url, save_path)
            print(' done.')
        else:
            print(f'Skipping {save_path.name}...')

    # Extract data from all the spreadsheets
    result = []
    for filename in filenames:
        print(f'Extracting {filename.name}...', end='', flush=True)
        try:
            data = extract(filename)
        except Exception as exp:
            import traceback
            print(f' ERROR! {traceback.format_exc().splitlines()[-1]}')
        else:
            print(' done.')
            result.extend(data)

    # Extract everything to a new CSV
    output = output_path / f'salarios-magistrados-{today}.csv'
    print(f'Extracting result to {output}...')
    export_csv(result, output)
Example No. 30
    def test_import_from_xpath_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        with open(self.filename, mode='rb') as fobj:
            table = rows.import_from_xpath(fobj,
                                           encoding=self.encoding,
                                           **self.kwargs)

        expected_meta = {'imported_from': 'xpath',
                         'filename': self.filename,
                         'encoding': self.encoding, }
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
Example No. 31
def download_game_data_for_country(path, year, country_code):
    'Download country athlete data for a specific year if not downloaded yet'

    filename = path.joinpath(_make_filename(year, country_code))
    if filename.exists():
        print(' (already downloaded, skipping)')
        return

    url = URL_DATA.format(year=year, country_code=country_code)
    response = requests.get(url)
    if '404' in response.url:  # country didn't play this year
        print(" (didn't play this year, skipping)")
        return

    html = response.content
    table = rows.import_from_html(BytesIO(html),
                                  encoding='utf-8',
                                  fields=FIELDS)
    rows.export_to_csv(table, str(filename.absolute()), encoding='utf-8')
    print(' ok')
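`FIELDS` is defined elsewhere in the original script and is not shown here. A hedged sketch of what such a mapping typically looks like in `rows` (the column names below are hypothetical; the real ones must match the HTML table being imported):

from collections import OrderedDict

import rows

# Hypothetical field mapping: column name -> rows field type, in column order
FIELDS = OrderedDict([
    ('athlete', rows.fields.TextField),
    ('sport', rows.fields.TextField),
    ('medals', rows.fields.IntegerField),
])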
Example No. 32
    def export_exact_predictions_to_csv(self, filename, ids, predictions):
        new_rows = []
        
        # classifier = {
        #     'Return_to_owner': 0,
        #     'Euthanasia': 1,
        #     'Adoption': 2,
        #     'Transfer': 3,
        #     'Died': 4
        # }

        for i, prediction in enumerate(predictions):
            #ID	Adoption	Died	Euthanasia	Return_to_owner	Transfer
            # print(type(prediction))
            # print(prediction == '3')
            # print(int(prediction == '3'))
            
            new_row = OrderedDict({})
            new_row['ID'] = ids[i]
            new_row['Adoption'] = int(prediction == '2')
            new_row['Died'] = int(prediction == '4')
            new_row['Euthanasia'] = int(prediction == '1')
            new_row['Return_to_owner'] = int(prediction == '0')
            new_row['Transfer'] = int(prediction == '3')
            
            new_rows.append(new_row)
            
        new_rows.sort(key=lambda e: e['ID'])
        #print(new_rows)
        
        new_fields = [(key, rows.fields.UnicodeField) for key in new_rows[0].keys()]
        table_to = rows.Table(fields=OrderedDict(new_fields))
        for row in new_rows:
            table_to.append(row)
            
        rows.export_to_csv(table_to, filename)
Example No. 33
def parse_licitacao(year, city_code):
    # TODO: add the IBGE code, municipality name and year to the 3 files
    filename = DOWNLOAD_PATH / f"{year}_{city_code}_Licitacao.zip"

    result1 = parse(filename, "licitacao")
    rows.export_to_csv(result1, OUTPUT_PATH / f"licitacao-{city_code}-{year}.csv")

    result2 = parse(filename, "licitacao_participante")
    rows.export_to_csv(
        result2, OUTPUT_PATH / f"licitacao-participante-{city_code}-{year}.csv"
    )

    result3 = parse(filename, "licitacao_vencedor")
    rows.export_to_csv(
        result3, OUTPUT_PATH / f"licitacao-vencedor-{city_code}-{year}.csv"
    )
    if row.animaltype == 'Dog':
        new_row.update(get_dog_age_columns(row))
        # Takes too long
        #new_row.update(get_dog_breed_columns(row))
        #new_row.update(get_dog_color_columns(row))
        
        new_row['outcome'] = get_animal_outcome(row)
        
        new_dog_rows.append(new_row)

new_fields = [(key, rows.fields.UnicodeField) for key in new_cat_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_cat_rows:
    table_to.append(row)
    
rows.export_to_csv(table_to, "clean_data3_no_color_no_breed_cat.csv")

new_fields = [(key, rows.fields.UnicodeField) for key in new_dog_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_dog_rows:
    table_to.append(row)
    
rows.export_to_csv(table_to, "clean_data3_no_color_no_breed_dog.csv")

##################
# Cleaning the test data
##################


table_from = rows.import_from_csv("../test.csv")
Example No. 35
import requests

import rows

url = "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/geraBoletim?idcampanha=42041"
print("*** Downloading PDF...")
response = requests.get(url)

# The line below will automatically identify the table in all PDF pages - it
# works for this file but not for all cases. You can be more specific by
# defining the page numbers, a start/end string (like the header/footer
# strings) and also by changing the table identification algorithm. Check the
# `backend`, `algorithm`, `starts_after`, `ends_before` and `page_numbers`
# parameters (see the sketch after this example).
# For this simple case you could also install rows' CLI (`pip install
# rows[cli]`) and run: `rows print <url>`
table = rows.import_from_pdf(io.BytesIO(response.content))
rows.export_to_csv(table, "beach-data.csv")
print("*** Table exported to beach-data.csv")

print("*** Extracted table:")
print(rows.export_to_txt(table))

# You could also iterate over the object, like:
# for row in table: print(row)


print("\n\n*** Extracted text:")
text_pages = rows.plugins.pdf.pdf_to_text(io.BytesIO(response.content))
print("\n\n".join(text_pages))
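Continuing the script above: as the comment notes, the extraction can be narrowed down instead of scanning every page. A hedged sketch using the parameters named there (the start/end strings are hypothetical markers, not text guaranteed to exist in this report):

table = rows.import_from_pdf(
    io.BytesIO(response.content),
    page_numbers=(1,),               # restrict extraction to the first page
    starts_after='PONTO DE COLETA',  # hypothetical string marking the table header
    ends_before='Fonte:',            # hypothetical string marking the footer
)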
Example No. 36

# Get data from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
        BytesIO(html),
        rows_xpath='//table/tr/td/ul/li',
        fields_xpath=OrderedDict([('name', './/text()'),
                                  ('link', './/a/@href')]))

regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')

def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data

new_fields = OrderedDict()
new_fields['name'] = cities.fields['name']
new_fields['state'] = rows.fields.TextField  # new field
new_fields['link'] = cities.fields['link']
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, 'brazilian-cities.csv')
Example No. 37
table_2 = rows.Table(fields=new_fields)
for row in table_1:
    if row.sexuponoutcome and row.sexuponoutcome != u'Unknown':
        castration, sex = row.sexuponoutcome.split()
    else:
        castration, sex = u'Unknown', u'Unknown'
    
    week_day = calendar.day_name[row.datetime.weekday()]
    us_holidays = holidays.UnitedStates()
    holiday = row.datetime in us_holidays
    age_in_days = get_animal_age(row.ageuponoutcome)
    if row.animaltype == "Cat":
        age_group = get_cat_age_group(age_in_days)
    else:
        age_group = get_dog_age_group(age_in_days)
    table_2.append({'animalid': row.animalid,
                    'name': row.name,
                    'datetime': week_day,
                    'holiday': holiday,
                    'outcometype': row.outcometype,
                    'outcomesubtype': row.outcomesubtype,
                    'animaltype': row.animaltype,
                    'sex': sex,
                    'castration': castration,
                    'agegroup': age_group,
                    'breed': row.breed,
                    'color': row.color})

rows.export_to_csv(table_2, "clean_data.csv")
Example No. 38
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(
    os.path.dirname(__file__), "../../tests/data/ecuador-medios-radiodifusoras.html"
)
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict(
    [
        ("url", ".//h2/a/@href"),
        ("name", ".//h2/a/text()"),
        ("address", './/div[@class="spField field_direccion"]/text()'),
        ("phone", './/div[@class="spField field_telefono"]/text()'),
        ("website", './/div[@class="spField field_sitio_web"]/text()'),
        ("email", './/div[@class="spField field_email"]/text()'),
    ]
)

table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, "ecuador-radiodifusoras.csv")
Example No. 39
# coding: utf-8

from __future__ import unicode_literals

import os

from collections import OrderedDict

import rows

# taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(os.path.dirname(__file__),
                        '../../tests/data/ecuador-medios-radiodifusoras.html')
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict([
        ('url', './/h2/a/@href'),
        ('name', './/h2/a/text()'),
        ('address', './/div[@class="spField field_direccion"]/text()'),
        ('phone', './/div[@class="spField field_telefono"]/text()'),
        ('website', './/div[@class="spField field_sitio_web"]/text()'),
        ('email', './/div[@class="spField field_email"]/text()'), ])

table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, 'ecuador-radiodifusoras.csv')
Example No. 40
    drow = row.__dict__
    
    for key, value in drow.items():
        if key not in statistics:
            statistics[key] = {}
        
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField

columns = {}
columns['value'] = string
for key in statistics.keys():
    for value in statistics[key].keys():
        columns[key + '_' + value] = string
        
table_2d_analize = rows.Table(fields=columns)
drows = map(lambda r: r.__dict__, table)
for key in statistics.keys():
    for value in statistics[key].keys():
        data = {}
        data['value'] = key + '_' + value
        for key2 in statistics.keys():
            for value2 in statistics[key2].keys():
                data[key2 + '_' + value2] = len(filter(lambda d: d[key] == value and d[key2] == value2, drows))
        table_2d_analize.append(data)
        
rows.export_to_csv(table_2d_analize, '2d_column_analize.csv')
                    
    def test_export_to_csv_accepts_dialect(self):
        result_1 = rows.export_to_csv(utils.table, dialect=csv.excel_tab)
        result_2 = rows.export_to_csv(utils.table, dialect=csv.excel)
        self.assertEqual(result_1.replace(b'\t', b','), result_2)
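The test above confirms that `export_to_csv` accepts a standard library `csv` dialect. A minimal sketch using it to write a tab-separated file (table contents and filename are placeholders):

import csv

import rows

table = rows.import_from_dicts([{'a': 1, 'b': 2}])
rows.export_to_csv(table, 'output.tsv', dialect=csv.excel_tab)  # tab-delimited output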
Example No. 42
    if row.animaltype == 'Dog':
        new_row.update(get_dog_age_columns(row))
        # Takes too long
        new_row.update(get_dog_breed_columns(row))
        new_row.update(get_dog_color_columns(row))
        
        new_row['outcome'] = get_animal_outcome(row)
        
        new_dog_rows.append(new_row)

new_fields = [(key, rows.fields.UnicodeField) for key in new_cat_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_cat_rows:
    table_to.append(row)
    
rows.export_to_csv(table_to, "clean_data3_cat.csv")

new_fields = [(key, rows.fields.UnicodeField) for key in new_dog_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_dog_rows:
    table_to.append(row)
    
rows.export_to_csv(table_to, "clean_data3_dog.csv")

##################
# Cleaning the test data
##################


table_from = rows.import_from_csv("../test.csv")
Example No. 43
# Get data from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)

regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'

    data = row._asdict()
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    data["name"], data["state"] = regexp_city_state.findall(data["name"])[0]
    return data


new_fields = OrderedDict()
new_fields["name"] = cities.fields["name"]
new_fields["state"] = rows.fields.TextField  # new field
new_fields["link"] = cities.fields["link"]
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, "brazilian-cities.csv")
def csv_claims_by_state():
    counter = count_claims_by_state()
    result = import_from_dicts(counter)
    result.order_by('label')
    return rows.export_to_csv(result)
Example No. 45
    def test_export_callback(self):
        table = rows.import_from_dicts([{"id": number} for number in range(10)])
        myfunc = mock.Mock()
        rows.export_to_csv(table, callback=myfunc, batch_size=3)
        self.assertEqual(myfunc.call_count, 4)
        self.assertEqual([x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10])
Example No. 46
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField
table_output = rows.Table(fields=OrderedDict({'column': string, 'value': string, 'amount': string, 'percent': string}))
for key in statistics.keys():
    for value in statistics[key].keys():
        table_output.append({
            'column': key,
            'value': value,
            'amount': statistics[key][value],
            'percent': "{0:.2f}".format(statistics[key][value] / quantidade_de_exemplos * 100)
        })
        
rows.export_to_csv(table_output, '1d_column_analize.csv')

# columns = {}
# columns['value'] = string
# for key in statistics.keys():
#     for value in statistics[key].keys():
#         columns[key + '_' + value] = string
        
# table_2d_analize = rows.Table(fields=columns)
# drows = map(lambda r: r.__dict__, table)
# for key in statistics.keys():
#     for value in statistics[key].keys():
#         data = {}
#         data['value'] = key + '_' + value
#         for key2 in statistics.keys():
#             for value2 in statistics[key2].keys():
Example No. 47
import rows

extract_links = rows.plugins.html.extract_links
extract_text = rows.plugins.html.extract_text

# Get the HTML
url = "http://wnpp.debian.net/"
response = requests.get(url)
html = response.content

# Import data, preserving cell's HTML
packages = rows.import_from_html(BytesIO(html), index=10, preserve_html=True)


def transform(row, table):
    'Extract links from "project" field and remove HTML from all'

    data = row._asdict()
    data["links"] = " ".join(extract_links(row.project))
    for key, value in data.items():
        if isinstance(value, six.text_type):
            data[key] = extract_text(value)
    return data


new_fields = packages.fields.copy()
new_fields["links"] = rows.fields.TextField
packages = rows.transform(new_fields, transform, packages)

rows.export_to_csv(packages, "debian-wnpp.csv")
Example No. 48
import requests
import rows
import six

extract_links = rows.plugins.html.extract_links
extract_text = rows.plugins.html.extract_text

# Get the HTML
url = 'http://wnpp.debian.net/'
response = requests.get(url)
html = response.content

# Import data, preserving cell's HTML
packages = rows.import_from_html(BytesIO(html), index=10, preserve_html=True)

def transform(row, table):
    'Extract links from "project" field and remove HTML from all'

    data = row._asdict()
    data['links'] = ' '.join(extract_links(row.project))
    for key, value in data.items():
        if isinstance(value, six.text_type):
            data[key] = extract_text(value)
    return data

new_fields = packages.fields.copy()
new_fields['links'] = rows.fields.TextField
packages = rows.transform(new_fields, transform, packages)

rows.export_to_csv(packages, 'debian-wnpp.csv')
Example No. 49
    def test_export_to_csv_accepts_dialect(self):
        result_1 = rows.export_to_csv(utils.table, dialect=csv.excel_tab)
        result_2 = rows.export_to_csv(utils.table, dialect=csv.excel)
        self.assertEqual(result_1.replace(b'\t', b','), result_2)
def csv_claims_by_tag():
    counter = count_claims_by_tag()
    result = import_from_dicts(counter)
    result.order_by('count')
    return rows.export_to_csv(result)
Example No. 51
    ('animaltype', rows.fields.UnicodeField),
    ('sex', rows.fields.UnicodeField),
    ('castration', rows.fields.UnicodeField),
    #('ageuponoutcome', rows.fields.UnicodeField),
    #('breed', rows.fields.UnicodeField),
    #('color', rows.fields.UnicodeField)
])

table_3 = rows.Table(fields=table3_fields)
for row in table_2:
    if len(row.name):
        has_name = 'Yes' 
    else: 
        has_name = 'No'
    
    if (row.datetime == "Sunday" or row.datetime == "Saturday" or row.holiday == 'True'): 
        free_day = True 
    else: 
        free_day = False
    
    table_3.append({
        'has_name': has_name,
        'free_day': free_day,
        'outcometype': row.outcometype,
        'animaltype': row.animaltype,
        'sex': row.sex,
        'castration': row.castration,
    })

rows.export_to_csv(table_3, "clean_data2.csv")