def run(self):
    rows = parse(self.input().path, self.struct,
                 skiprows=self.skiptop, usecols=self.usecolumns)
    oked_transform_rows(rows)
    save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
def run(self):
    for target in self.input():
        rows = parse(target.path, self.struct,
                     skiprows=self.skiptop, usecols=self.usecolumns)
        save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
def run(self):
    for i, target in enumerate(self.input()):
        self.set_status_message('Parsing {}'.format(target.path))
        rows = parse(target.path, CompanieRow,
                     skiprows=self.skiptop, sheets=self.sheets)
        save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
        percent = round((i + 1) * 100 / len(self.input()))
        self.set_progress_percentage(percent)
def run(self):
    r = get(self.url)
    soup = bs(encode(r, encoding='utf-8'), 'lxml')
    table = soup.find('table')
    rows = table.find_all('tr')
    csvrows = []
    for tr in rows:
        values = tr.find_all('td')
        # skip header or malformed rows that don't have enough cells
        if len(values) < 3:
            continue
        rate = values[-2].text
        code = values[-3].text.split('/')[0].strip()
        date = datetime.strftime(datetime.today(), DEFAULT_DATE_FORMAT)
        csvrows.append((code, rate, date))
    save_csvrows(self.output().path, csvrows)
def parse_excel_rect_area_to_csv(xl_fpath, csv_fpath, wrapper,
                                 sheets=None, skiptopnum=None, usecols=None):
    """ Save records parsed from an Excel file to a csv file. """

    # get the list of sheets
    xl_df = pd.ExcelFile(xl_fpath)
    xl_sheets = xl_df.sheet_names

    _sheets = sheets
    # by default we parse all the sheets
    if not sheets:
        _sheets = list(range(len(xl_sheets)))

    # init skip-top row counts;
    # by default we always skip one row from the top
    _skiptopnums = [1 for _ in range(len(_sheets))]
    # for now, if skiptopnum is given it is applied only to the first sheet
    if skiptopnum:
        _skiptopnums[0] = skiptopnum

    count = 0
    for i, sh in enumerate(_sheets):
        if sh <= len(xl_sheets) - 1:
            df = pd.read_excel(xl_fpath,
                               sheet_name=xl_sheets[sh],
                               skiprows=_skiptopnums[i],
                               usecols=usecols,
                               index_col=None,
                               dtype=str,
                               header=None)

            # convert Excel's empty cells to empty strings
            data = df.replace(np.nan, '', regex=True)
            data.dropna(inplace=True)

            rows = [wrapper(*x) for x in data.values]
            if len(rows) > 0:
                save_csvrows(csv_fpath, [attr.astuple(r) for r in rows])
                count += len(rows)

    return count
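# A minimal usage sketch for parse_excel_rect_area_to_csv, assuming an
# attrs-decorated row class; the class, file paths and column range below
# are hypothetical examples, not part of the project.
import attr

@attr.s
class AddressRow:
    region = attr.ib(default='')
    city = attr.ib(default='')
    street = attr.ib(default='')

# parse only the second sheet, skipping 3 rows at the top and reading columns A:C
count = parse_excel_rect_area_to_csv('/tmp/addresses.xlsx', '/tmp/addresses.csv',
                                     AddressRow, sheets=[1], skiptopnum=3,
                                     usecols='A:C')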
def parse_report(apikey: str, report_name: str, chunk_size: int,
                 output_fpath: str, parsed_fpath: str, struct,
                 version: str, timeout: int, query=None,
                 progress_callback=None) -> Dict:

    total_rows_count = get_total_rows_for_version(version)
    all_chunks = compute_chunks(total_rows_count, chunk_size)
    parsed_chunks = read_parsed_chunks(parsed_fpath)

    # queue of chunks that still have to be downloaded
    chunks = deque(prepare_chunks(all_chunks, parsed_chunks))

    total_chunks = len(chunks)
    parsed_chunks_count = len(parsed_chunks)
    parsed_rows_count = parsed_chunks_count * chunk_size

    while chunks:
        chunk = chunks.popleft()
        url = build_url_for_data_page(report_name, apikey,
                                      version=version, query=query)
        try:
            data = load_data(url, struct)
        except (HTTPError, ConnectionError, Timeout, RetryError, ReadTimeout):
            # network error: put the chunk back and retry after a pause
            chunks.append(chunk)
            sleep(timeout)
        else:
            if not data:
                break
            parsed_rows_count += len(data)
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            # record the chunk as parsed so a restart can skip it
            if parsed_fpath:
                append_file(parsed_fpath, str(chunk))
            sleep(timeout)

        if progress_callback:
            s, p = progress_status_info(report_name, version,
                                        total_chunks, parsed_chunks_count)
            progress_callback(s, p)
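# The chunk helpers used above (compute_chunks, read_parsed_chunks,
# prepare_chunks) are not shown in this section. A minimal sketch of what
# they plausibly do, assuming a chunk is a sequential page index and the
# .prs file stores one parsed chunk per line; this is an assumption, not
# the project's actual implementation.
import os
from typing import List

def compute_chunks(total_rows_count: int, chunk_size: int) -> List[int]:
    # one chunk per page of chunk_size rows
    pages = (total_rows_count + chunk_size - 1) // chunk_size
    return list(range(pages))

def read_parsed_chunks(parsed_fpath: str) -> List[int]:
    # chunks already recorded in the .prs file, one per line
    if not parsed_fpath or not os.path.exists(parsed_fpath):
        return []
    with open(parsed_fpath) as f:
        return [int(line) for line in f if line.strip()]

def prepare_chunks(all_chunks: List[int], parsed_chunks: List[int]) -> List[int]:
    # keep only the chunks that still need to be downloaded
    done = set(parsed_chunks)
    return [c for c in all_chunks if c not in done]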
def run(self):
    raw = get(self.url)
    dicts = parse_json_from_js(raw, self.pattern)
    rows = [attr.astuple(MrpRow(**_d)) for _d in dicts]
    save_csvrows(self.output().path, rows)
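# parse_json_from_js is referenced above but not defined in this section.
# A minimal sketch of its likely behaviour, assuming the pattern is a regex
# whose first group captures the JSON embedded in the JS page; this is an
# assumption, not the project's actual helper.
import json
import re
from typing import List

def parse_json_from_js(raw: str, pattern: str) -> List[dict]:
    # pull the JSON payload out of the JavaScript source and decode it
    m = re.search(pattern, raw, re.DOTALL)
    if not m:
        return []
    return json.loads(m.group(1))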
def goszakup_rest_parsing(csv_fpath: str, url: str, token: str, timeout: int,
                          limit: int, struct=None, prs_fpath=None,
                          callb_luigi_status=None):
    """ Save rows parsed from the REST service of goszakup.gov.kz to the given csv file.

    Supports retrying after a failure. Every JSON response contains the uri of the
    next page to parse (the "next_page" key). Once the limit is specified at the
    start, it automatically appears in that uri, e.g.
    /v3/rnu?page=next&limit=500&search_after=100827.
    So after a failure, the next run starts from the last uri stored in the .prs file.
    """

    def load_and_parse(_url):
        raw = get(_url, headers=headers, timeout=timeout)
        # Box has a method with the same name as the "items" key
        raw = raw.replace('items', 'data', 1)
        b = Box(json.loads(raw))
        # parse the limit out of the next_page uri
        lim = 0
        if b.next_page:
            lim = int(b.next_page.split('&')[1].split('=')[1])
        return b.next_page, b.total, lim, b.data

    headers = dict()
    headers['Authorization'] = token

    host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

    last_parsed_url = None
    # if the previous run was terminated by an error, resume from the stored uri
    if prs_fpath and os.path.exists(prs_fpath):
        uri = read_file_rows(prs_fpath).pop()
        last_parsed_url = f'{host}{uri}'

    parsed_rows_count = 0
    total = 0
    curr_url = f'{url}?limit={limit}'
    curr_limit = limit

    if last_parsed_url:
        curr_uri, _, curr_limit, _ = load_and_parse(last_parsed_url)
        parsed_rows_count = get_file_lines_count(csv_fpath)

    while curr_url:
        try:
            curr_uri, total, curr_limit, raw_data = load_and_parse(curr_url)
        except Exception:
            raise  # sleep(timeout)
        else:
            if curr_uri:
                curr_url = f'{url}?{curr_uri}'
            else:
                curr_url = None

            if prs_fpath:
                append_file(prs_fpath, curr_uri)

            data = [dict_to_csvrow(d, struct) for d in raw_data]
            save_csvrows(csv_fpath, data, quoter="\"")
            parsed_rows_count += len(raw_data)
            sleep(timeout)

        if callb_luigi_status:
            status = (f'Parsing {curr_url}. '
                      f'Total rows: {total}. Parsed rows: {parsed_rows_count}.')
            percent = math.ceil((parsed_rows_count * 100) / total)
            callb_luigi_status(status, percent)

    return dict(total=total, parsed=parsed_rows_count)
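# A minimal call sketch for goszakup_rest_parsing. The endpoint follows the
# /v3/rnu example from the docstring; the token, row struct and file paths
# below are placeholders, not real project values.
res = goszakup_rest_parsing(
    csv_fpath='/tmp/rnu.csv',
    url='https://goszakup.gov.kz/v3/rnu',  # placeholder host, path as in the docstring example
    token='Bearer <api-token>',            # placeholder token
    timeout=3,
    limit=500,
    struct=RnuRow,                         # hypothetical row structure passed to dict_to_csvrow
    prs_fpath='/tmp/rnu.prs',              # stores the last parsed uri for resuming
)
print(res)  # {'total': ..., 'parsed': ...}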