def load2(url, struct, updates_date=None):
    """Fetch a JSON payload from ``url`` and convert its items to CSV rows.

    Args:
        url: Address requested through ``get``.
        struct: Row structure forwarded to ``dict_to_csvrow``.
        updates_date: Optional date. When truthy, only items whose
            ``modified`` timestamp falls on a later calendar date are kept.

    Returns:
        list: One ``dict_to_csvrow`` result per retained item.

    Raises:
        ElkRequestError: The decoded payload is a dict whose ``Box`` wrapper
            exposes an ``error`` attribute.
    """
    modified_format = '%Y-%m-%d %H:%M:%S'

    def is_newer(modified, threshold):
        # Only the calendar date takes part in the comparison.
        return datetime.strptime(modified, modified_format).date() > threshold

    response = get(url, headers=headers, timeout=5)
    # transform to python object (dict, list)
    raw = json.loads(response)

    # an error dict may arrive in place of the data
    if isinstance(raw, dict):
        boxed = Box(raw)
        if hasattr(boxed, 'error'):
            raise ElkRequestError(boxed.error)

    if not updates_date:
        return [dict_to_csvrow(item, struct) for item in raw]

    # only updates made after the given date are wanted
    rows = []
    for item in raw:
        modified = Box(item, default_box=True).modified
        if modified and is_newer(modified, updates_date):
            rows.append(dict_to_csvrow(item, struct))
    return rows
def run(self):
    """Page through the ``Contract`` query and append each page as CSV rows.

    The query is executed repeatedly with ``from``/``to``/``limit``
    variables. After every non-empty page the ``id`` of its last item is
    sent as ``after`` on the next request. The loop ends on the first
    response whose ``Contract`` entry is missing or empty.
    """
    client = self.get_client()
    query = gql(self.query)
    variables = {
        'from': str(self.start_date),
        'to': str(self.end_date),
        'limit': self.limit,
    }
    cursor = None

    while True:
        if cursor:
            variables["after"] = cursor
        response = client.execute(query, variable_values=variables)

        contracts = response.get('Contract')
        if not contracts:
            break

        # the last id of this page is the cursor for the next one
        cursor = contracts[-1]['id']
        rows = [dict_to_csvrow(contract, self.struct) for contract in contracts]
        save_csvrows(self.output().path, rows, sep=self.sep, quoter="\"")
def run(self):
    """Write a header, then page through ``Subjects`` and append CSV rows.

    The header row is built from the attrs field names of
    ``GoszakupCompanyRow`` and written first. The query is then executed
    repeatedly with ``from``/``to``/``limit`` variables. After every
    non-empty page the ``pid`` of its last item is sent as ``after`` on the
    next request. The loop ends on the first response whose ``Subjects``
    entry is missing or empty.
    """
    client = self.get_client()
    query = gql(self.query)
    variables = {
        'from': str(self.start_date),
        'to': str(self.end_date),
        'limit': self.limit,
    }

    column_names = tuple(field.name for field in attr.fields(GoszakupCompanyRow))
    save_csvrows(self.output().path, [column_names], sep=self.sep)

    cursor = None
    while True:
        if cursor:
            variables["after"] = cursor
        response = client.execute(query, variable_values=variables)

        subjects = response.get('Subjects')
        if not subjects:
            break

        # the last pid of this page is the cursor for the next one
        cursor = subjects[-1]['pid']
        rows = [dict_to_csvrow(subject, self.struct) for subject in subjects]
        save_csvrows(self.output().path, rows, sep=self.sep, quoter="\"")
async def process_id(self, session, idx, semaphore):
    """Load one id, store its ``;``-joined row and mark the id as parsed.

    Args:
        session: Passed through to ``self._load``.
        idx: Identifier to load; appended to the parsed file in every
            handled outcome.
        semaphore: Async context manager entered after ``self.ratelimit``.

    Returns:
        tuple: ``(row, idx)`` where ``row`` is the joined string, or ``()``
        when ``NotSuccessError`` was raised.
    """
    row = ()
    try:
        async with self.ratelimit, semaphore:
            loaded = await self._load(session, idx)
            row = ';'.join(dict_to_csvrow(loaded, JuridicalInfo))
    except NotSuccessError:
        # count the failure; the id is still recorded as parsed below
        self.stat['nse'] += 1
    else:
        append_file(self.fm.curr_file, row)

    append_file(self.fm.parsed_file, idx)
    return row, idx
def _load(self, bid):
    """Request payments for ``bid`` and return them as CSV rows.

    Args:
        bid: Identifier formatted into the request template and written to
            every returned payment as ``bin``.

    Returns:
        list: One ``dict_to_csvrow`` result per payment in the answer.

    Raises:
        KgdTooManyRequests: The response status code is 429.
        KgdResponseError: The response body is empty or is not parseable
            XML (the parser error is chained as ``__cause__``).
        KgdRequestError: The answer contains an ``err`` element.
        Exception: Whatever ``raise_for_status`` raises for other non-200
            status codes.
    """
    request = self.request_template.format(bid, *self._date_range)
    url = self.url_template.format(self.host, self._token)
    # NOTE(review): verify=False turns off TLS certificate verification for
    # this request - confirm this is intentional for the target host.
    r = requests.post(url, request, headers=self.headers,
                      verify=False, timeout=self._timeout)

    status_code = r.status_code
    if status_code == 429:
        raise KgdTooManyRequests('Kgd limitation exceeded')
    if status_code != 200:
        r.raise_for_status()

    if not r.text:
        # recently KGD started to send an empty response to limit the
        # count of requests in a certain time; earlier they just sent a
        # TooManyRequests error
        raise KgdResponseError('Empty response received')

    try:
        d = Box(parse(r.text)).answer
    except ExpatError as err:
        # it could be html (from a squid proxy for example)
        # or some other trash that is not xml formatted
        raise KgdResponseError('Not XML formatted') from err

    # a request error means something is wrong with our xml-request;
    # all errors are described in the KGD API docs
    if 'err' in d:
        errcode = d.err.errorcode
        raise KgdRequestError(f'Errorcode {errcode}')

    # it might be just one payment
    payment = d.payment
    payments = payment if isinstance(payment, list) else [payment]

    # enrich each row by bin
    for p in payments:
        p.bin = bid

    return [dict_to_csvrow(p, self.struct) for p in payments]
def run(self):
    """Download all pages of ``self.url`` and append their items as CSV rows.

    Each ``next_page`` value returned by the source is appended to
    ``self.parsed_fpath``; on a rerun the last stored value is used to
    build the first url. A failed request is retried with the same url
    after ``self.timeout * 3`` seconds. When no next page is reported, the
    loop ends and a ``{'total': ..., 'parsed': ...}`` summary is appended
    to ``self.success_fpath``.
    """
    error_timeout = self.timeout * 3
    headers = {'Authorization': self.token}

    url = f'{self.url}?limit={self.limit}'
    host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

    # parsed blocks of data are stored as uris;
    # on rerun we continue from the last stored uri
    if os.path.exists(self.parsed_fpath):
        parsed_uris = read_lines(self.parsed_fpath)
        # an existing but empty file gives nothing to resume from
        if parsed_uris:
            url = f'{host}{parsed_uris[-1]}'

    total = 0
    parsed_count = get_file_lines_count(self.output().path) or 0

    while url:
        try:
            r = get(url, headers=headers, timeout=self.timeout)
        except Exception:
            # best effort: wait longer and retry the same url
            sleep(error_timeout)
        else:
            response = Box(json.loads(r))
            # NOTE(review): the resume path above builds the url as
            # host + uri, this one as self.url + '?' + next_page - confirm
            # both forms are valid for the stored values.
            if response.next_page:
                url = f'{self.url}?{response.next_page}'
                append_file(self.parsed_fpath, response.next_page)
            else:
                url = None

            total = response.total
            # key access on purpose: attribute access would hit dict.items
            data = [dict_to_csvrow(d, self.struct) for d in response['items']]
            save_csvrows(self.output().path, data, quoter="\"")
            # count rows actually written; a short last page must not
            # push the counter past the reported total
            parsed_count += len(data)
            sleep(self.timeout)

        self.set_status_message(f'Total: {total}. Parsed: {parsed_count}')
        # total stays 0 until a request succeeds (or when the source is
        # empty); skip the percentage instead of dividing by zero
        if total:
            self.set_progress_percentage(round((parsed_count * 100) / total))

    stat = dict(total=total, parsed=parsed_count)
    append_file(self.success_fpath, str(stat))
def load3(url, struct):
    """Fetch a JSON payload from ``url`` and convert every item to a CSV row.

    Args:
        url: Address requested through ``get``.
        struct: Row structure forwarded to ``dict_to_csvrow``.

    Returns:
        list: One ``dict_to_csvrow`` result per item of the decoded payload.

    Raises:
        ElkRequestError: The decoded payload is a dict whose ``Box`` wrapper
            exposes an ``error`` attribute.
    """
    r = get(url, headers=headers, timeout=5)
    # transform to python object (dict, list)
    raw = json.loads(r)

    if isinstance(raw, dict):
        box_obj = Box(raw)
        if hasattr(box_obj, 'error'):
            # an error dict arrived in place of the data
            raise ElkRequestError(box_obj.error)

    return [dict_to_csvrow(d, struct) for d in raw]
def load_data(url, struct, column_filter=None):
    """Load JSON from ``url`` and convert its items to CSV rows.

    Every value of an item is converted to ``str`` before the optional
    ``column_filter`` is applied and the row is built. Items that raise
    ``BadDataType`` are skipped.

    Args:
        url: Address requested through ``load_content``.
        struct: Row structure forwarded to ``dict_to_csvrow``.
        column_filter: Optional filter handed to ``apply_filter_to_dict``
            together with the stringified item.

    Returns:
        list: The rows built so far; may be empty.

    Raises:
        ExternalSourceError: An ``HTTPError`` (including the one raised for
            an error payload) or ``ReadTimeout`` occurred; the original
            exception is chained as ``__cause__``.
    """
    data = []
    try:
        r = load_content(url, headers=headers, timeout=TIMEOUT)
        # transform to python object (dict, list)
        raw = json.loads(r)

        if isinstance(raw, dict):
            box_obj = Box(raw)
            if hasattr(box_obj, 'error'):
                # an error dict arrived in place of the data
                raise HTTPError(box_obj.error)

        for d in raw:
            try:
                # we need all values to be string
                _d = {k: str(v) for k, v in d.items()}
                if column_filter:
                    _d = apply_filter_to_dict(_d, column_filter)
                data.append(dict_to_csvrow(_d, struct))
            except BadDataType:
                # best effort: drop the row that cannot be converted
                continue
    except (HTTPError, ReadTimeout) as err:
        raise ExternalSourceError('Could not load {}'.format(url)) from err
    except BadDataType:
        # raised outside the per-row handling; return what was collected
        pass

    return data