def insertUrlList(db: couchdb.Database, urlList):
    """Store every URL in *urlList* as a fresh, unvisited 'url' document.

    Each document uses the URL itself as its CouchDB ``_id``.
    """
    logging.info("inserting url list...")
    for url in urlList:
        db.save({'_id': url, 'type': 'url', 'visited': False})
class EdgeDataBridge(object):
    """Edge Bridge.

    Mirrors tenders from the central tenders API into a public CouchDB
    database: walks the API feed and upserts every changed tender document.
    """

    def __init__(self, config):
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')
        self.retrievers_params = self.config_get('retrievers_params')
        self.client = TendersClient(host_url=self.api_host,
                                    api_version=self.api_version, key='')
        # Target database URL: <couch_url>/<public_db>
        self.couch_url = urljoin(self.config_get('couch_url'),
                                 self.config_get('public_db'))
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))

    def config_get(self, name):
        """Return option *name* from the 'main' section of the config."""
        return self.config.get('main').get(name)

    def get_teders_list(self):
        """Yield ``(tender_id, dateModified)`` pairs from the API feed."""
        for item in get_tenders(host=self.api_host,
                                version=self.api_version, key='',
                                extra_params={'mode': '_all_'},
                                retrievers_params=self.retrievers_params):
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Fetch tender *tender_id* from the API and upsert it into CouchDB.

        Skips the API round-trip entirely when the stored document already
        carries the same ``dateModified``.
        """
        tender_doc = self.db.get(tender_id)
        if tender_doc:
            if tender_doc['dateModified'] == date_modified:
                return  # already up to date
        tender = self.client.get_tender(tender_id).get('data')
        if tender:
            tender['_id'] = tender_id
            tender['doc_type'] = 'Tender'
            if tender_doc:
                # Carry the revision over so CouchDB accepts the update.
                tender['_rev'] = tender_doc['_rev']
                logger.info('Update tender {} '.format(tender_id))
            else:
                logger.info('Save tender {} '.format(tender_id))
            try:
                self.db.save(tender)
            except Exception as e:
                # BUG FIX: Exception.message does not exist on Python 3
                # (deprecated since 2.6); format the exception itself.
                logger.info('Saving tender {} fail with error {}'.format(
                    tender_id, e),
                    extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})
        else:
            logger.info('Tender {} not found'.format(tender_id))

    def run(self):
        """Run one full synchronisation pass over the API feed."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
class EdgeDataBridge(object):
    """Edge Bridge.

    Pulls every tender exposed by the tenders API feed and keeps a copy
    of each one in a public CouchDB database.
    """

    def __init__(self, config):
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')
        self.client = TendersClient(
            host_url=self.api_host, api_version=self.api_version, key='')
        self.couch_url = urljoin(
            self.config_get('couch_url'), self.config_get('public_db'))
        self.db = Database(
            self.couch_url, session=Session(retry_delays=range(10)))

    def config_get(self, name):
        """Look up *name* inside the 'main' section of the configuration."""
        return self.config.get('main').get(name)

    def get_teders_list(self):
        """Generate (id, dateModified) tuples for every tender in the feed."""
        feed = get_tenders(host=self.api_host, version=self.api_version,
                           key='', extra_params={'mode': '_all_'})
        for item in feed:
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Store (or refresh) one tender document in CouchDB."""
        stored = self.db.get(tender_id)
        if stored and stored['dateModified'] == date_modified:
            # Nothing changed upstream; skip the API round-trip.
            return
        tender = self.client.get_tender(tender_id).get('data')
        if not tender:
            logger.info('Tender {} not found'.format(tender_id))
            return
        tender['_id'] = tender_id
        tender['doc_type'] = 'Tender'
        if stored:
            tender['_rev'] = stored['_rev']
            logger.info('Update tender {} '.format(tender_id))
        else:
            logger.info('Save tender {} '.format(tender_id))
        try:
            self.db.save(tender)
        except Exception as e:
            logger.info(
                'Saving tender {} fail with error {}'.format(
                    tender_id, e.message),
                extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})

    def run(self):
        """Walk the feed once, saving every listed tender."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
class CouchdbOutput(OutputModule):
    """Write incoming events into a CouchDB database, singly or in bulk."""

    def __init__(self, actor_config, couchdb_url, payload=None,
                 selection="data", parallel_streams=1,
                 native_events=False, **kw):
        OutputModule.__init__(self, actor_config)
        self.pool.createQueue("inbox")
        self.registerConsumer(self.consume, "inbox")
        self.couchdb = Database(couchdb_url)

    def consume(self, event):
        """Persist *event* into CouchDB.

        Bulk events: the current ``_rev`` of every already-stored document
        is looked up first so the batched update does not conflict.
        Single events: the revision is fetched when the doc already exists.
        """
        if event.isBulk():
            bulk_docs = {}
            for e in extractBulkItems(event):
                doc = e.get(self.kwargs.selection)
                # NOTE: both keys are popped eagerly; 'id' wins when both
                # are present (the inner pop always runs).
                doc_id = doc.pop('id', doc.pop('_id', ''))
                if doc_id:
                    doc['_id'] = doc['id'] = doc_id
                    bulk_docs[doc['id']] = doc
            # Fetch current revisions for the docs that already exist.
            for row in self.couchdb.view(
                    '_all_docs', keys=list(bulk_docs.keys())).rows:
                if row.id in bulk_docs:
                    bulk_docs[row.id]['_rev'] = row['value']['rev']
            try:
                response = self.couchdb.update(list(bulk_docs.values()))
                for ok, doc_id, rest in response:
                    if ok:
                        self.logging.info("Saved {}".format(doc_id))
                    else:
                        self.logging.error(
                            "Error on save bulk. Type {}, message {}, doc {}".
                            format(rest, getattr(rest, 'message', ''), doc_id))
            except Exception as e:
                self.logging.error("Uncaught error {} on save bulk".format(
                    e, ))
        else:
            data = event.get(self.kwargs.selection)
            doc_id = data.get('id', data.get('_id'))
            if doc_id:
                data['_id'] = data['id'] = doc_id
                if doc_id in self.couchdb:
                    # BUG FIX: previously read the *builtin* ``id`` instead
                    # of ``doc_id``, so the revision lookup (and the debug
                    # message) operated on the wrong value.
                    rev = self.couchdb.get(doc_id).rev
                    data['_rev'] = rev
                    self.logging.debug(
                        "Update revision in data {} to {}".format(
                            doc_id, rev))
                self.couchdb.save(data)
def traverseTree(node, db: couchdb.Database):
    """Split every <p> element under *node* into sentences and store each
    sentence as its own CouchDB document.

    Progress is reported per paragraph via ``printProgress``.
    """
    all_paragraphs = node.find_all('p')
    total = len(all_paragraphs)
    for position, child in enumerate(all_paragraphs, start=1):
        printProgress(position, total)
        # Concatenate the element's text fragments; each fragment is
        # preceded by a single space (so the result keeps a leading space).
        text = "".join(" " + chunk for chunk in child.stripped_strings)
        for sentence in sentences.splitParagraph(text):
            if sentence and sentence != "":
                doc = {
                    '_id': str(uuid.uuid4()),
                    'type': 'sentence',
                    'sentence': sentence,
                    'source': 'foundation',
                    'date': datetime.datetime.now().isoformat(),
                }
                db.save(doc)
    print("")
class AuctionsDataBridge(object):
    """AuctionsDataBridge

    Feeds the auction planning machinery: polls the tenders API (or a
    CouchDB continuous changes feed) and launches an auction worker
    process for every tender that should be planned.
    """

    def __init__(self, config):
        super(AuctionsDataBridge, self).__init__()
        self.config = config
        # Feed URL: <tenders_api_server>/api/<version>/tenders
        self.tenders_url = urljoin(
            self.config_get('tenders_api_server'),
            '/api/{}/tenders'.format(
                self.config_get('tenders_api_version')
            )
        )
        self.tz = tzlocal()
        self.couch_url = urljoin(
            self.config_get('couch_url'),
            self.config_get('auctions_db')
        )
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))
        self.url = self.tenders_url

    def config_get(self, name):
        # Read option *name* from the 'main' section of the configuration.
        return self.config.get('main').get(name)

    def tender_url(self, tender_id):
        # URL of the auction sub-resource for *tender_id*.
        return urljoin(self.tenders_url,
                       'tenders/{}/auction'.format(tender_id))

    def get_teders_list(self, re_planning=False):
        """Yield tenders from the API feed that need auction planning.

        Pages through the feed using ``self.offset`` until an empty page
        is returned. Tenders already planned (or whose start date is in
        the past) are skipped; cancelled tenders with a future auction
        get their auction document closed as a side effect.
        """
        while True:
            params = {'offset': self.offset,
                      'opt_fields': 'status,auctionPeriod',
                      'mode': '_all_'}
            request_id = generate_request_id(prefix=b'data-bridge-req-')
            logger.debug('Start request to {}, params: {}'.format(
                self.url, params),
                extra={"JOURNAL_REQUEST_ID": request_id})
            response = requests.get(
                self.url, params=params,
                headers={'content-type': 'application/json',
                         'X-Client-Request-ID': request_id})
            logger.debug('Request response: {}'.format(response.status_code))
            if response.ok:
                response_json = response.json()
                if len(response_json['data']) == 0:
                    # Empty page: remember the next offset and stop.
                    logger.info("Change offset date to {}".format(
                        response_json['next_page']['offset']),
                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                    self.offset = response_json['next_page']['offset']
                    break
                for item in response_json['data']:
                    # Candidate for planning: auction has a start date,
                    # no end date yet, and the tender is in active.auction.
                    if 'auctionPeriod' in item \
                            and 'startDate' in item['auctionPeriod'] \
                            and 'endDate' not in item['auctionPeriod'] \
                            and item['status'] == "active.auction":
                        start_date = iso8601.parse_date(
                            item['auctionPeriod']['startDate'])
                        start_date = start_date.astimezone(self.tz)
                        # View keyed by start time in milliseconds.
                        auctions_start_in_date = startDate_view(
                            self.db,
                            key=(mktime(start_date.timetuple())
                                 + start_date.microsecond / 1E6) * 1000
                        )
                        if datetime.now(self.tz) > start_date:
                            logger.info(
                                "Tender {} start date in past. Skip it for planning".format(item['id']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        if re_planning and item['id'] in self.tenders_ids_list:
                            logger.info(
                                "Tender {} already planned while replanning".format(item['id']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        elif not re_planning and [
                                row.id for row in auctions_start_in_date.rows
                                if row.id == item['id']]:
                            logger.info(
                                "Tender {} already planned on same date".format(item['id']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        yield item
                    if item['status'] == "cancelled":
                        # Close the auction document if the auction had
                        # not started yet.
                        future_auctions = endDate_view(
                            self.db, startkey=time() * 1000
                        )
                        if item["id"] in [i.id for i in future_auctions]:
                            logger.info("Tender {} canceled".format(item["id"]),
                                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            auction_document = self.db[item["id"]]
                            # -100 marks a cancelled auction stage.
                            auction_document["current_stage"] = -100
                            auction_document["endDate"] = datetime.now(
                                self.tz).isoformat()
                            self.db.save(auction_document)
                            logger.info(
                                "Change auction {} status to 'canceled'".format(item["id"]),
                                extra={"JOURNAL_REQUEST_ID": request_id,
                                       'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                logger.info(
                    "Change offset date to {}".format(
                        response_json['next_page']['offset']),
                    extra={"JOURNAL_REQUEST_ID": request_id,
                           'MESSAGE_ID': DATA_BRIDGE_PLANNING}
                )
                self.offset = response_json['next_page']['offset']
            else:
                # API request failed; back off before retrying.
                sleep(10)

    def start_auction_worker(self, tender_item):
        """Launch the external auction worker in 'planning' mode for one
        tender, retrying until the subprocess call succeeds."""
        result = do_until_success(
            check_output,
            args=([self.config_get('auction_worker'),
                   'planning', str(tender_item['id']),
                   self.config_get('auction_worker_config')],),
        )
        logger.info("Auction planning command result: {}".format(result),
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING_PROCESS})

    def planning_with_couch(self):
        """Plan auctions from the CouchDB continuous changes feed, forever."""
        logger.info('Start Auctions Bridge with feed to couchdb',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        self.planned_tenders = {}
        self.last_seq_id = 0
        while True:
            do_until_success(self.handle_continuous_feed)

    def handle_continuous_feed(self):
        """Consume one pass of the continuous changes feed and plan every
        new/changed auction document exactly once per start date."""
        change = self.db.changes(feed='continuous',
                                 filter="auctions/by_startDate",
                                 since=self.last_seq_id,
                                 include_docs=True)
        for tender_item in change:
            if 'id' in tender_item:
                start_date = tender_item['doc']['stages'][0]['start']
                if tender_item['doc'].get("current_stage", "") == -100:
                    # Cancelled auction (see get_teders_list); ignore.
                    continue
                if tender_item['doc'].get("mode", "") == "test":
                    logger.info('Sciped test auction {}'.format(tender_item['id']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                    continue
                if tender_item['id'] in self.planned_tenders and \
                        self.planned_tenders[tender_item['id']] == start_date:
                    # Already planned for this exact start date.
                    logger.debug('Tender {} filtered'.format(tender_item['id']))
                    continue
                logger.info('Tender {} selected for planning'.format(tender_item['id']),
                            extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                self.start_auction_worker(tender_item)
                self.planned_tenders[tender_item['id']] = start_date
            elif 'last_seq' in tender_item:
                # Checkpoint so the next feed pass resumes where we stopped.
                self.last_seq_id = tender_item['last_seq']
        logger.info('Resume data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})

    def run(self):
        """Plan auctions by polling the tenders API feed in an endless loop."""
        logger.info('Start Auctions Bridge',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        self.offset = ''
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        while True:
            for tender_item in self.get_teders_list():
                logger.debug('Tender {} selected for planning'.format(tender_item))
                self.start_auction_worker(tender_item)
                sleep(2)
            logger.info('Sleep...',
                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
            sleep(100)
            logger.info('Resume data sync...',
                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})

    def run_re_planning(self):
        """Re-plan every tender in the feed once, tracking processed ids
        so duplicates in the feed are only planned a single time."""
        self.re_planning = True
        self.tenders_ids_list = []
        self.offset = ''
        logger.info('Start Auctions Bridge for re-planning...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_RE_PLANNING})
        for tender_item in self.get_teders_list(re_planning=True):
            logger.debug('Tender {} selected for re-planning'.format(tender_item))
            self.start_auction_worker(tender_item)
            self.tenders_ids_list.append(tender_item['id'])
            sleep(1)
        logger.info("Re-planning auctions finished",
                    extra={'MESSAGE_ID': DATA_BRIDGE_RE_PLANNING})
"""One-shot loader: import two standards trees into the 'standards' DB."""
from json import load

from couchdb import Database

db = Database("http://localhost:5985/standards")

# (source file, document id — also reused as title and description)
for filename, name in (("D10003FC.json", "english"),
                       ("D100011F.json", "math")):
    # BUG FIX: the original used load(open(...)) and never closed the
    # file handle; 'with' guarantees it is closed promptly.
    with open(filename) as fp:
        children = load(fp)
    data = {"_id": name, "children": children,
            "title": name, "description": name}
    db.save(data)
"""Clean duplicated 'publisher' values in the lr-data database.

Some records have a publisher of the form "X, supported by X"; this
script collapses them to just "X" and saves the updated document.
"""
from requests import get
from couchdb import Database

db = Database("http://localhost:5985/lr-data")
split_on = ", supported by"
url = "http://12.109.40.31/search?terms=grade&page={0}"

page = 0
data = get(url.format(page)).json()
while len(data) > 0:
    for item in data:
        publisher = item['publisher']
        if publisher is not None and split_on in publisher:
            parts = [x.strip() for x in publisher.split(split_on)]
            # Only fix records where both halves are identical.
            if parts[0] == parts[1]:
                print(parts)
                doc = db[item['_id']]
                item['publisher'] = parts[0]
                doc.update(item)
                print(db.save(doc))
    # Next page of search results; loop ends on an empty page.
    page += 1
    data = get(url.format(page)).json()
class EdgeDataBridge(object):
    """Edge Bridge.

    Mirrors tenders from the tenders API into a public CouchDB database,
    validating its configuration (API endpoint, CouchDB URL and database
    name) at construction time and raising DataBridgeConfigError with a
    descriptive message when any part is wrong.
    """

    def __init__(self, config):
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')
        self.retrievers_params = self.config_get('retrievers_params')
        try:
            self.client = TendersClient(host_url=self.api_host,
                                        api_version=self.api_version,
                                        key='')
        except MissingSchema:
            raise DataBridgeConfigError(
                'In config dictionary empty or missing \'tenders_api_server\'')
        except ConnectionError as e:
            raise e
        self.couch_url = urljoin(self.config_get('couch_url'),
                                 self.config_get('public_db'))
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))
        try:
            # Probe the database so configuration errors surface here
            # instead of on the first save.
            self.db.info()
        except ResourceNotFound:
            error_message = "Database with name '" + self.config_get(
                'public_db') + "' doesn\'t exist"
            raise DataBridgeConfigError(error_message)
        except error as e:  # socket error
            if e.errno == errno.ECONNREFUSED:
                raise DataBridgeConfigError(
                    "Connection refused: 'couch_url' is invalid in config dictionary"
                )
            # NOTE(review): other socket errors are silently swallowed
            # here, as in the original — confirm this is intended.
        except AttributeError as e:
            raise DataBridgeConfigError(
                '\'couch_url\' is missed or empty in config dictionary.')
        except KeyError as e:
            # BUG FIX: KeyError has no .message attribute on Python 3;
            # inspect e.args instead (args[0] holds the missing key).
            if 'db_name' in e.args:
                raise DataBridgeConfigError(
                    '\'public_db\' name is missed or empty in config dictionary'
                )

    def config_get(self, name):
        """Return option *name* from the 'main' config section.

        Raises DataBridgeConfigError when the 'main' section is missing.
        """
        try:
            return self.config.get('main').get(name)
        except AttributeError as e:
            raise DataBridgeConfigError(
                'In config dictionary missed section \'main\'')

    def get_teders_list(self):
        """Yield ``(tender_id, dateModified)`` pairs from the API feed."""
        for item in get_tenders(host=self.api_host,
                                version=self.api_version, key='',
                                extra_params={'mode': '_all_'},
                                retrievers_params=self.retrievers_params):
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Fetch tender *tender_id* and upsert it into CouchDB.

        Skips the fetch when the stored document already carries the same
        ``dateModified``.
        """
        tender_doc = self.db.get(tender_id)
        if tender_doc:
            if tender_doc['dateModified'] == date_modified:
                return  # already up to date
        tender = self.client.get_tender(tender_id).get('data')
        if tender:
            tender['_id'] = tender_id
            tender['doc_type'] = 'Tender'
            if tender_doc:
                # Carry the revision over so CouchDB accepts the update.
                tender['_rev'] = tender_doc['_rev']
                logger.info('Update tender {} '.format(tender_id))
            else:
                logger.info('Save tender {} '.format(tender_id))
            try:
                self.db.save(tender)
            except Exception as e:
                # BUG FIX: Exception.message does not exist on Python 3;
                # format the exception itself.
                logger.info(
                    'Saving tender {} fail with error {}'.format(
                        tender_id, e),
                    extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})
        else:
            logger.info('Tender {} not found'.format(tender_id))

    def run(self):
        """Run one full synchronisation pass over the API feed."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
def setSentenceAsVisited(db: couchdb.Database, sentenceId: str):
    """Mark the sentence document *sentenceId* as processed and persist it.

    NOTE: the key 'procesed' (sic) is kept as-is — existing documents in
    the database already use that spelling.
    """
    doc = db[sentenceId]
    doc['procesed'] = True
    db.save(doc)
def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    """Clear the 'visited' flag on every URL document returned by
    *not_visited_view* and save each one back.

    The view is streamed in batches of 100 rows to bound memory use.
    """
    for row in db.iterview(not_visited_view, 100):
        doc = db[row.id]
        doc['visited'] = False
        db.save(doc)