Example #1
 def run(self):
     rows = parse(self.input().path,
                  self.struct,
                  skiprows=self.skiptop,
                  usecols=self.usecolumns)
     oked_transform_rows(rows)
     save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
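The rows returned by parse() are converted with attr.astuple(), which only works on attrs-decorated classes, so the struct handed to parse() here (and in the following examples) is expected to look roughly like the sketch below (class and field names are hypothetical, not taken from the original project):

import attr

# hypothetical attrs row class; attr.astuple(SampleRow(...)) flattens an
# instance into a plain tuple that save_csvrows can write out
@attr.s
class SampleRow:
    code = attr.ib(default='')
    name = attr.ib(default='')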
Example #2
 def run(self):
     for target in self.input():
         rows = parse(target.path,
                      self.struct,
                      skiprows=self.skiptop,
                      usecols=self.usecolumns)
         save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
Example #3
    def run(self):
        for i, target in enumerate(self.input()):
            self.set_status_message('Parsing {}'.format(target.path))
            rows = parse(target.path,
                         CompanieRow,
                         skiprows=self.skiptop,
                         sheets=self.sheets)
            save_csvrows(self.output().path, [attr.astuple(r) for r in rows])

            percent = round((i + 1) * 100 / len(self.input()))
            self.set_progress_percentage(percent)
Example #4
    def run(self):
        r = get(self.url)
        soup = bs(encode(r, encoding='utf-8'), 'lxml')
        table = soup.find('table')
        rows = table.find_all('tr')
        csvrows = []
        for tr in rows:
            values = tr.find_all('td')
            # the rate sits in the second-to-last cell, the currency code
            # (the text before the '/') in the third-to-last one
            rate = values[-2].text
            code = values[-3].text.split('/')[0].strip()
            # stamp every row with today's date
            date = datetime.strftime(datetime.today(), DEFAULT_DATE_FORMAT)
            csvrows.append((code, rate, date))

        save_csvrows(self.output().path, csvrows)
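All of these examples finish by handing their rows to save_csvrows. Its implementation is not part of this listing; a minimal sketch of what such a helper could look like, assuming it simply appends tuples to a csv file (delimiter, encoding and quoting behaviour are assumptions):

import csv

def save_csvrows(fpath, rows, sep=';', quoter='"'):
    # append an iterable of tuples to a csv file, creating it if needed
    with open(fpath, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=sep, quotechar=quoter,
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows(rows)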
Example #5
def parse_excel_rect_area_to_csv(xl_fpath, csv_fpath, wrapper, sheets=None,
                                 skiptopnum=None, usecols=None):
    """ Save records parsed from excel file to csv """

    # get list of sheets
    xl_df = pd.ExcelFile(xl_fpath)
    xl_sheets = xl_df.sheet_names

    # by default, parse all the sheets
    _sheets = sheets
    if not sheets:
        _sheets = list(range(len(xl_sheets)))

    # rows to skip from the top of each sheet;
    # by default one header row is skipped
    _skiptopnums = [1] * len(_sheets)

    # if skiptopnum is given, it is applied to the first sheet only
    if skiptopnum:
        _skiptopnums[0] = skiptopnum

    count = 0

    for i, sh in enumerate(_sheets):

        # skip sheet indexes that are not present in the workbook
        if sh < len(xl_sheets):
            df = pd.read_excel(xl_fpath,
                               sheet_name=xl_sheets[sh],
                               skiprows=_skiptopnums[i],
                               usecols=usecols,
                               index_col=None,
                               dtype=str,
                               header=None)

            # convert Excel's empty cells to empty string
            data = df.replace(np.nan, '', regex=True)
            data.dropna(inplace=True)
            rows = [wrapper(*x) for x in data.values]

            if len(rows) > 0:
                save_csvrows(csv_fpath, [attr.astuple(r) for r in rows])

            count += len(rows)

    return count
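A hypothetical call of the function above; the file names, sheet selection, column range and row class are placeholders rather than values from the original project:

import attr

@attr.s
class PriceRow:
    code = attr.ib(default='')
    name = attr.ib(default='')
    price = attr.ib(default='')

# parse columns A-C of the first two sheets, skipping three header rows on sheet 0
count = parse_excel_rect_area_to_csv('prices.xlsx', 'prices.csv', PriceRow,
                                     sheets=[0, 1],
                                     skiptopnum=3,
                                     usecols='A:C')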
Example #6
def parse_report(apikey: str, report_name: str, chunk_size: int,
                 output_fpath: str, parsed_fpath: str, struct,
                 version: str, timeout: int, query=None, progress_callback=None) -> Dict:

    total_rows_count = get_total_rows_for_version(version)
    all_chunks = compute_chunks(total_rows_count, chunk_size)

    parsed_chunks = read_parsed_chunks(parsed_fpath)

    chunks = deque(prepare_chunks(all_chunks, parsed_chunks))
    total_chunks = len(chunks)

    parsed_chunks_count = len(parsed_chunks)
    parsed_rows_count = parsed_chunks_count * chunk_size

    while chunks:
        chunk = chunks.popleft()
        url = build_url_for_data_page(report_name, apikey, version=version, query=query)

        try:
            data = load_data(url, struct)
        except (HTTPError, ConnectionError, Timeout, RetryError, ReadTimeout):
            # transient network error: requeue the chunk and retry it later
            chunks.append(chunk)
            sleep(timeout)
        else:
            if not data:
                break

            parsed_rows_count += len(data)
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            if parsed_chunks:
                append_file(parsed_fpath, str(chunk))
            sleep(timeout)

            if progress_callback:
                s, p = progress_status_info(report_name, version,
                                            total_chunks, parsed_chunks_count)
                progress_callback(s, p)

    # summary of the run (the signature promises a Dict)
    return dict(total=total_rows_count, parsed=parsed_rows_count)
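The loop above retries failed chunks by pushing them back onto the deque and trying again after a pause. Stripped of the project-specific helpers, that pattern looks roughly like this (fetch() is a hypothetical stand-in for load_data):

from collections import deque
from time import sleep

def process_all(chunks, fetch, timeout=5):
    queue = deque(chunks)
    results = []
    while queue:
        chunk = queue.popleft()
        try:
            results.append(fetch(chunk))
        except Exception:
            # put the failed chunk back and retry it after a pause
            queue.append(chunk)
            sleep(timeout)
    return results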
Example #7
 def run(self):
     raw = get(self.url)
     dicts = parse_json_from_js(raw, self.pattern)
     rows = [attr.astuple(MrpRow(**_d)) for _d in dicts]
     save_csvrows(self.output().path, rows)
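parse_json_from_js is a project helper that is not part of this listing; judging by the call above it extracts a JSON literal embedded in a JavaScript page. A rough sketch under that assumption (the real helper may behave differently; the pattern is expected to contain one capture group around the JSON text):

import json
import re

def parse_json_from_js(raw_html, pattern):
    # find the JSON literal captured by the pattern and decode it into dicts
    match = re.search(pattern, raw_html, re.DOTALL)
    if not match:
        return []
    return json.loads(match.group(1))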
Example #8
def goszakup_rest_parsing(csv_fpath: str, url: str, token: str, timeout: int,
                          limit: int, struct=None, prs_fpath=None, callb_luigi_status=None):
    """ Save rows with data parsed from REST service of goszakup.gov.kz to given csv file.
    Supports retry. Every response(json) contains a url to the next page(uri) for parsing("next_page" key)
    Limitation automatily will appear in uri once we specified it at start,
    like - /v3/rnu?page=next&limit=500&search_after=100827
    So, after fall, next time parsing will be started with uri presented by last one in .prs file.
    """

    def load_and_parse(_url):
        raw = get(_url, headers=headers, timeout=timeout)
        # the payload's "items" key clashes with Box's items() method, so rename it
        raw = raw.replace('items', 'data', 1)
        b = Box(json.loads(raw))
        # pull the page limit out of the next_page uri
        lim = 0
        if b.next_page:
            lim = int(b.next_page.split('&')[1].split('=')[1])
        return b.next_page, b.total, lim, b.data

    headers = dict()
    headers['Authorization'] = token

    host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

    last_parsed_url = None
    # resume from the last parsed uri if the previous run was interrupted by an error
    if prs_fpath and os.path.exists(prs_fpath):
        uri = read_file_rows(prs_fpath).pop()
        last_parsed_url = f'{host}{uri}'

    parsed_rows_count = 0
    total = 0
    curr_url = f'{url}?limit={limit}'
    curr_limit = limit
    if last_parsed_url:
        curr_uri, _, curr_limit, _ = load_and_parse(last_parsed_url)
        parsed_rows_count = get_file_lines_count(csv_fpath)

    while curr_url:
        try:
            curr_uri, total, curr_limit, raw_data = load_and_parse(curr_url)
        except Exception as e:
            raise
            # sleep(timeout)
        else:
            if curr_uri:
                curr_url = f'{url}?{curr_uri}'
            else:
                curr_url = None

            if prs_fpath:
                append_file(prs_fpath, curr_uri)

            data = [dict_to_csvrow(d, struct) for d in raw_data]
            save_csvrows(csv_fpath, data, quoter="\"")
            parsed_rows_count += len(raw_data)
            sleep(timeout)

        if callb_luigi_status:
            status = f'Parsing {curr_url}. Total rows: {total}. Parsed rows: {parsed_rows_count}.'
            percent = math.ceil((parsed_rows_count * 100)/total)
            callb_luigi_status(status, percent)

    res = dict(total=total, parsed=parsed_rows_count)

    return res
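A hypothetical invocation of the parser above; the endpoint, token, file names and row structure are placeholders, not the project's real configuration:

result = goszakup_rest_parsing(csv_fpath='rnu.csv',
                               url='https://<goszakup-api-host>/v3/rnu',  # placeholder endpoint
                               token='Bearer <token>',                    # placeholder auth token
                               timeout=3,
                               limit=500,
                               struct=RnuRow,      # assumed attrs-style struct matching the JSON fields
                               prs_fpath='rnu.prs')
print(result)   # {'total': ..., 'parsed': ...}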