def base_preprocessor_delete_single(instance_id=None, **kw):
    """Create a generic DELETE_SINGLE preprocessor.

    Accepts a single argument, `instance_id`, which is the primary key of
    the instance which will be deleted.
    """
    logger.info('`base_preprocessor_delete_single` used for endpoint')
def base_preprocessor_delete_many(search_params=None, **kw):
    """Create a generic DELETE_MANY preprocessor.

    Accepts a single argument, `search_params`, which is a dictionary
    containing the search parameters for the request.
    """
    logger.info('`base_preprocessor_delete_many` used for endpoint')
def base_preprocessor_post(data=None, **kw):
    """Create a generic POST preprocessor.

    Accepts a single argument, `data`, which is the dictionary of fields to
    set on the new instance of the model.
    """
    logger.info('`base_preprocessor_post` used for endpoint')
def base_preprocessor_get_many(search_params=None, **kw):
    """Create a generic GET_MANY preprocessor.

    Accepts a single argument, `search_params`, which is a dictionary
    containing the search parameters for the request.
    """
    logger.info('`base_preprocessor_get_many` responded to request')
def base_preprocessor_get_single(instance_id=None, **kw):
    """Create a generic GET_SINGLE preprocessor.

    Accepts a single argument, `instance_id`, the primary key of the
    instance of the model to get.
    """
    logger.info('`base_preprocessor_get_single` responded to request')
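# A minimal sketch of how these generic preprocessors might be wired up,
# assuming Flask-Restless (whose preprocessor hooks use exactly these argument
# names). The `manager` (an APIManager) and `model` arguments are illustrative
# assumptions, not names defined in this module.
def register_base_preprocessors(manager, model):
    # Flask-Restless takes a dict mapping request types to lists of callables.
    manager.create_api(
        model,
        methods=['GET', 'POST', 'DELETE'],
        preprocessors={
            'GET_SINGLE': [base_preprocessor_get_single],
            'GET_MANY': [base_preprocessor_get_many],
            'POST': [base_preprocessor_post],
            'DELETE_SINGLE': [base_preprocessor_delete_single],
            'DELETE_MANY': [base_preprocessor_delete_many],
        })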
def run(self):
    logger.info('Starting the heartbeat thread')
    while not self.stop:
        self.process._write_command_('PING')
        self.process.check_alive()
        time.sleep(5)
    logger.info('Stopping the heartbeat thread')
def __init__(self, uuid):
    self.uuid = uuid  # + uuid4().hex
    self.keymanager = RedisKeyManager(uuid)
    self.buy_orders = list()
    self.sell_orders = list()
    self.redis = Redis()
    logger.info('Initializing orderbook: %s' % self.uuid)
def post(self):
    '''!Post a json object which includes username and password to create a new user
    {
        "username": "******",
        "password": "******"
    }
    :return: status code 201 - create success
    :return: status code 400 - username and password can't be empty
    :return: status code 409 - user already exists
    '''
    username = self.args['username']
    password = self.args['password']
    if username == '' or password == '':
        return {"message": "username and password can't be empty"}, 400
    if Users.query.filter_by(username=username).first() is not None:
        logger.info('user already exists!')
        abort(409, 'user already exists!')
    newuser = Users(username, password)
    db.session.add(newuser)
    db.session.commit()
    logger.info('created user: ' + username)
    return {"message": "created"}, 201
def harvest(self, **kwargs):  # pragma: no cover
    """Make HTTP requests to the OAI server.

    :param kwargs: OAI HTTP parameters.
    :rtype: :class:`sickle.OAIResponse`
    """
    start_time = time()
    for _ in range(self.max_retries):
        if self.http_method == 'GET':
            payload_str = "&".join("%s=%s" % (k, v) for k, v in kwargs.items())
            url_without_encoding = u"{}?{}".format(self.endpoint, payload_str)
            http_response = requests.get(url_without_encoding, **self.request_args)
            self.http_response_url = http_response.url
        else:
            http_response = requests.post(self.endpoint, data=kwargs, **self.request_args)
            self.http_response_url = http_response.url

        if http_response.status_code == 503:
            retry_after = self.RETRY_SECONDS
            logger.info("HTTP 503! Retrying after %d seconds..." % retry_after)
            sleep(retry_after)
        else:
            logger.info("took {} seconds to call pmh url: {}".format(elapsed(start_time), http_response.url))
            http_response.raise_for_status()
            if self.encoding:
                http_response.encoding = self.encoding
            return OAIResponse(http_response, params=kwargs)
def login(): """This function logs a user into the system. Upon a GET request a LoginForm will be shown to the user. Upon a POST request the form will be validated and if valid the users specified password will be hashed and compared to the stored password. Should they be equal the user will be logged in (as such his User object will be stored in the session) and redirected to the default page of the authentication-module. Is this not the case or if the form was invalid in the first place, he will be shown the form again. """ form = LoginForm(request.form) if request.method == 'POST' and form.validate(): user = User.objects(username = form.username.data).first() if user is not None: if user.password == generateHash(form.password.data): session['user'] = user session['currency'] = u"\u20AC" return redirect(session.get('next', url_for('budget.showSummary'))) logger.info('User %s has logged in.' % user.username) flash('The specified username and/or password were incorrect.') return render_template('auth/login.html', form = form)
def delete_user(id):
    if not g.user.is_admin():
        logger.error("%s tried to access /delete-user/%d", g.user.email, id)
        abort(403)
    user = User.query.get_or_404(id)
    if user.is_admin():
        flash("Cannot delete the admin")
        return redirect(url_for('user_list'))
    form = DeleteUserForm()
    if form.validate_on_submit():
        if request.form['button'] == 'Cancel':
            return form.redirect(url_for('user_list'))
        logger.info("%s was deleted", user.email)
        db.session.delete(user)
        db.session.commit()
        flash("User deleted successfully")
        return redirect(url_for('user_list'))
    return render_template('admin_delete_user.html', Title="Delete user", form=form, user=user)
def get(request, response):
    logger.info("Handling request")
    import time
    # time.sleepBAD(10)
    response.setStatus(202)
    response.write("Success\n", "text/plain")
def delTag(self, data):
    for del_tag in data:
        remove_tag = Tags.query.filter_by(tag_name=del_tag).first()
        if remove_tag is not None:
            remove_tag.tag_count -= 1
            logger.info("remove_tags=%s" % remove_tag.tag_name)
            self.tags.remove(remove_tag)
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*?url=["\'](.*?)["\']', re.IGNORECASE)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect_path = HTMLParser().unescape(redirect_match[0].strip())
        redirect_url = urlparse.urljoin(r.request.url, redirect_path)
        logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
        return redirect_url

    return None
def register():
    if request.method == 'POST':
        logger.info('Registration POST: %s %s %s'
                    % (request.form['email'],
                       request.form['name'],
                       request.form['tel']))
        user = User()
        user.name = request.form.get('name', None)
        user.email = request.form.get('email', None)
        user.tel = request.form.get('tel', None)
        user.msg = request.form.get('message', None)
        if not user.is_valid:
            logger.error('Form is not valid. Request: %s' % request)
            return jsonify(False)
        try:
            user.save()
        except:
            logger.error('Could not save user to the database. Request: %s' % request)
            return jsonify('Error')
        logger.info('Register: Done!')
        send_email(user)
        return jsonify(True)
    else:
        return jsonify(False)
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()
    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
def save_feedback_response(bound_logger, survey_feedback_response):
    bound_logger.info("Saving feedback response")
    survey = survey_feedback_response.get("survey_id")
    period = survey_feedback_response.get("collection", {}).get("period")
    invalid = survey_feedback_response.get("invalid")
    if invalid:
        survey_feedback_response.pop("invalid")

    feedback_response = FeedbackResponse(invalid=invalid,
                                         data=survey_feedback_response,
                                         survey=survey,
                                         period=period)
    try:
        db.session.add(feedback_response)
        db.session.commit()
    except IntegrityError as e:
        logger.error("Integrity error in database. Rolling back commit", error=e)
        db.session.rollback()
        raise e
    except SQLAlchemyError as e:
        logger.error("Unable to save response", error=e)
        db.session.rollback()
        raise e
    else:
        logger.info("Feedback response saved")
        return invalid
def generateSSLCert():
    if not os.path.exists(os.path.join(config.DATA_DIR, 'plexivity.key')) or not os.path.exists(os.path.join(config.DATA_DIR, 'plexivity.crt')):
        logger.warning("plexivity was started with ssl support but no cert was found, trying to generate cert and key now")
        try:
            from OpenSSL import crypto, SSL
            from socket import gethostname

            # create a key pair
            k = crypto.PKey()
            k.generate_key(crypto.TYPE_RSA, 1024)

            # create a self-signed cert
            cert = crypto.X509()
            cert.get_subject().C = "US"
            cert.get_subject().ST = "plex land"
            cert.get_subject().L = "plex land"
            cert.get_subject().O = "plexivity"
            cert.get_subject().OU = "plexivity"
            cert.get_subject().CN = gethostname()
            cert.set_serial_number(1000)
            cert.gmtime_adj_notBefore(0)
            cert.gmtime_adj_notAfter(10*365*24*60*60)
            cert.set_issuer(cert.get_subject())
            cert.set_pubkey(k)
            cert.sign(k, 'sha1')

            open(os.path.join(config.DATA_DIR, 'plexivity.crt'), "wt").write(crypto.dump_certificate(crypto.FILETYPE_PEM, cert))
            open(os.path.join(config.DATA_DIR, 'plexivity.key'), "wt").write(crypto.dump_privatekey(crypto.FILETYPE_PEM, k))
            logger.info("ssl cert and key generated and saved to: %s" % config.DATA_DIR)
        except:
            logger.error("unable to generate ssl key and cert")
def get_pdf_url_status(pdf_url):
    worker = current_process()
    logger.info(u'{} checking pdf url: {}'.format(worker, pdf_url))

    is_pdf = False
    http_status = None

    try:
        response = http_get(url=pdf_url.url,
                            ask_slowly=True,
                            stream=True,
                            publisher=pdf_url.publisher,
                            session_id=get_session_id())
    except Exception as e:
        logger.error(u"{} failed to get response: {}".format(worker, e.message))
    else:
        with response:
            try:
                is_pdf = is_a_pdf_page(response, pdf_url.publisher)
                http_status = response.status_code
            except Exception as e:
                logger.error(u"{} failed reading response: {}".format(worker, e.message))

    pdf_url.is_pdf = is_pdf
    pdf_url.http_status = http_status
    pdf_url.last_checked = datetime.utcnow()

    logger.info(u'{} updated pdf url: {}'.format(worker, pdf_url))
    return pdf_url
def get_multiple_pubs_response():
    is_person_who_is_making_too_many_requests = False

    biblios = []
    body = request.json
    if "dois" in body:
        if len(body["dois"]) > 25:
            abort_json(413, "max number of DOIs is 25")
        if len(body["dois"]) > 1:
            is_person_who_is_making_too_many_requests = True
        for doi in body["dois"]:
            biblios += [{"doi": doi}]
            if u"jama" in doi:
                is_person_who_is_making_too_many_requests = True

    elif "biblios" in body:
        for biblio in body["biblios"]:
            biblios += [biblio]
        if len(body["biblios"]) > 1:
            is_person_who_is_making_too_many_requests = True

    logger.info(u"in get_multiple_pubs_response with {}".format(biblios))

    run_with_hybrid = g.hybrid
    if is_person_who_is_making_too_many_requests:
        logger.info(u"is_person_who_is_making_too_many_requests, so returning 429")
        abort_json(429, u"sorry, you are calling us too quickly. Please email [email protected] so we can figure out a good way to get you the data you are looking for.")

    pubs = pub.get_pubs_from_biblio(biblios, run_with_hybrid)
    return pubs
def startScheduler():
    db.create_all()

    # create default roles!
    if not db.session.query(models.Role).filter(models.Role.name == "admin").first():
        admin_role = models.Role(name='admin', description='Administrator Role')
        user_role = models.Role(name='user', description='User Role')
        db.session.add(admin_role)
        db.session.add(user_role)
        db.session.commit()

    try:
        import tzlocal
        tz = tzlocal.get_localzone()
        logger.info("local timezone: %s" % tz)
    except:
        tz = None

    if not tz or tz.zone == "local":
        logger.error('Local timezone name could not be determined. Scheduler will display times in UTC for any log '
                     'messages. To resolve this, set up /etc/timezone with the correct time zone name.')
        tz = pytz.utc

    # in debug mode this is executed twice :(
    # DON'T run flask in auto reload mode when testing this!
    scheduler = BackgroundScheduler(logger=sched_logger, timezone=tz)
    scheduler.add_job(notify.task, 'interval',
                      seconds=config.SCAN_INTERVAL,
                      max_instances=1,
                      start_date=datetime.datetime.now(tz) + datetime.timedelta(seconds=2))
    scheduler.start()
    sched = scheduler
def crop_postprocessor_update_single(result=None, **kw):
    """Create a Crop-specific PATCH_SINGLE and PUT_SINGLE postprocessor.

    Accepts a single argument, `result`, which is the dictionary
    representation of the requested instance of the model.
    """
    logger.info('`crop_postprocessor_update_single` used for endpoint')
def crop_postprocessor_get_single(result=None, **kw):
    """Create a Crop-specific GET_SINGLE postprocessor.

    Accepts a single argument, `result`, which is the dictionary
    representation of the requested instance of the model.
    """
    logger.info('`crop_postprocessor_get_single` responded to request')
def crop_postprocessor_post(result=None, **kw):
    """Create a Crop-specific POST postprocessor.

    Accepts a single argument, `result`, which is the dictionary
    representation of the created instance of the model.
    """
    logger.info('`crop_postprocessor_post` used for endpoint')
def crop_postprocessor_delete_single(was_deleted=None, **kw):
    """Create a Crop-specific DELETE_SINGLE postprocessor.

    Accepts a single argument, `was_deleted`, which represents whether the
    instance has been deleted.
    """
    logger.info('`crop_postprocessor_delete_single` used for endpoint')
def get_statistics():
    """Gathers all user's statistics.

    It gathers all statistics about user activity (quantity of memorized
    words, quantity of passed tests, average grade and quantity of passed
    tests per week).

    :Route: '/api/user/statistic'.
    :Methods: GET.
    :Returns: json object with all user's statistics.
    """
    uid = flask_login.current_user.uid
    tests = db.get_user_tests(uid)
    tests_taken = len(tests)
    logger.info(tests)
    average_grade = reduce(lambda x, y: x + y, [item[0] for item in tests]) / tests_taken
    words = db.get_words(uid, True)
    words_memorized = len(words)
    tests_per_week = utils.get_tests_count_per_week(tests)
    result = {'tests_taken': tests_taken,
              'average_grade': average_grade,
              'words_memorized': words_memorized,
              'tests_per_week': tests_per_week}
    return Response(json.dumps(result), mimetype='application/json', status=200)
def login():
    import hashlib
    if request.method == 'GET':
        return render_template("login.html")

    email = request.form["email"]
    password = request.form["password"]
    m = hashlib.md5()
    m.update(password)

    userinfo = User.query.filter(User.email == email).first()
    if userinfo is None:
        msg = 'User does not exist, please register first'
        logger.info(msg)
        return render_template("login.html", email=email, msg=msg)

    userinfo = User.query.filter(and_(User.email == email, User.password == m.hexdigest())).first()
    if userinfo is None:
        msg = 'Incorrect username or password'
        password = ''
        return render_template("login.html", email=email, password=password, msg=msg)
    else:
        print 'login successful'
        response = make_response(redirect('/'))
        secure_token = create_token(userinfo.id, userinfo.email, request.user_agent)
        response.set_cookie('secure_token', value=secure_token, max_age=2592000)
        login_user(userinfo)
        return response
def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(response.request.url))
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDFs start with this character
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article')
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True

    return False
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name)
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)
    if where:
        create_table_command = create_table_command.replace("from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)

    create_table_command += """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;
        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(table_name=table_name)
    for command in create_table_command.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
        SELECT id AS doi,
            updated AS updated,
            response_jsonb->>'evidence' AS evidence,
            response_jsonb->>'oa_status' AS oa_color,
            response_jsonb->>'free_fulltext_url' AS best_open_url,
            response_jsonb->>'year' AS year,
            response_jsonb->>'found_hybrid' AS found_hybrid,
            response_jsonb->>'found_green' AS found_green,
            response_jsonb->>'error' AS error,
            response_jsonb->>'is_boai_license' AS is_boai_license,
            replace(api->'_source'->>'journal', ' ', '') AS journal,
            replace(api->'_source'->>'publisher', ' ', '') AS publisher,
            api->'_source'->>'title' AS title,
            api->'_source'->>'subject' AS subject,
            response_jsonb->>'license' AS license
        FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)  # they are already lowercased

    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
def post(self):
    '''!Receive consent receipt

    :return: status code 409 - already exists
    :return: status code 201 - created
    '''
    logger.info(json.loads(request.get_data()))
    for item in self.receipt:
        self.receipt[item] = request.json.get(item)

    receipt = Receipts.query.filter_by(
        consent_receipt_id=self.receipt['consentReceipt']['consent_receipt_id']).first()

    # if the receipt exists, update it
    if receipt is not None:
        logger.debug('Receipt already exists!')
        # TODO still return 409 after demo
        # abort(409, 'Receipt already exists!')
        # =====start=====
        receipt.rpt = self.receipt['rpt']
        receipt.rs_id = self.receipt['consentReceipt']['rs_id']
        receipt.consent_receipt_id = self.receipt['consentReceipt']['consent_receipt_id']
        receipt.service_contract_id = self.receipt['consentReceipt']['service_contract_id']
        receipt.authorization_status = self.receipt['consentReceipt']['authorization_status']
        receipt.data_usage_license = self.receipt['consentReceipt']['data_usage_license']
        receipt.consent_summary = json.dumps(self.receipt['consentReceipt']['consent_summary'])
        receipt.update_time = datetime.datetime.now()
        db.session.add(receipt)
        db.session.commit()
        return {'message': 'updated'}, 201
        # ======end=====

    for item in self.receipt['consentReceipt']:
        if item is None:
            logger.debug(item + ' can not be none!')
            abort(409, item + ' can not be none!')
        # receipt[item] = self.receipt[item]

    receipt = Receipts(
        self.receipt['rpt'],
        self.receipt['consentReceipt']['rs_id'],
        str(self.receipt['consentReceipt']['consent_receipt_id']),
        str(self.receipt['consentReceipt']['service_contract_id']),
        self.receipt['consentReceipt']['authorization_status'],
        str(self.receipt['consentReceipt']['data_usage_license']),
        json.dumps(self.receipt['consentReceipt']['consent_summary']))

    mapping = Mappings(
        self.receipt['consentReceipt']['account_id'],
        str(self.receipt['consentReceipt']['consent_receipt_id']),
        datetime.datetime.now())

    db.session.add(mapping)
    db.session.add(receipt)
    db.session.commit()
    return {'message': 'created'}, 201
def export_with_versions(do_all=False, job_type="normal", filename=None, view=None, week=False, json=False):
    # ssh -i /Users/hpiwowar/Dropbox/ti/certificates/aws-data-export.pem [email protected]
    # aws s3 cp test.txt s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/test.txt

    # connect to our bucket
    (conn, ssh_client) = login_to_aws()

    # to connect to clarivate's bucket
    # clarivate_conn = boto.ec2.connect_to_region('us-east-2')
    # clarivate_instance = clarivate_conn.get_all_instances()[0].instances[0]
    # clarivate_ssh_client = sshclient_from_instance(clarivate_instance, "/Users/hpiwowar/Dropbox/ti/certificates/aws-data-export.pem", user_name="ec2-user")

    logger.info(u"log in done")

    now_timestamp = datetime.datetime.utcnow().isoformat()[0:19].replace(":", "")
    if not filename:
        filename = "all_dois_{}.csv".format(now_timestamp)

    today = datetime.datetime.utcnow()
    if week:
        last_week = today - datetime.timedelta(days=9)
        view = "export_main_changed_with_versions where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format(
            last_week.isoformat()[0:19])
        filename = "changed_dois_with_versions_{}_to_{}.csv".format(
            last_week.isoformat()[0:19], today.isoformat()[0:19]).replace(":", "")
    else:
        filename = "dois_with_versions_{}.csv".format(today.isoformat()[0:19]).replace(":", "")

    if not view:
        view = "export_main_changed_with_versions"

    command = """psql {}?ssl=true -c "\copy (select * from {}) to '{}' WITH (FORMAT CSV, HEADER);" """.format(
        os.getenv("DATABASE_URL"), view, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """gzip -c {} > {}.gz;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """date -r {}.gz;""".format(filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))
    gz_modified = stdout.strip()

    # command = """aws s3 cp {}.gz s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
    #     filename, filename, gz_modified)
    command = """aws s3 cp {}.gz s3://oadoi-for-clarivate/{}.gz --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
        filename, filename, gz_modified)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # also make a .DONE file
    # how to calculate a checksum http://www.heatware.net/linux-unix/how-to-create-md5-checksums-and-validate-a-file-in-linux/
    command = """md5sum {}.gz > {}.gz.DONE;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """date -r {}.gz;""".format(filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))
    gz_done_modified = stdout.strip()

    # copy up the .DONE file
    # command = """aws s3 cp {}.gz.DONE s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz.DONE --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
    #     filename, filename, gz_done_modified)
    command = """aws s3 cp {}.gz.DONE s3://oadoi-for-clarivate/{}.gz.DONE --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
        filename, filename, gz_done_modified)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # logger.info(u"now go to *** https://console.aws.amazon.com/s3/object/mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz?region=us-east-1&tab=overview ***".format(
    #     filename))
    logger.info(u"public link is at *** https://s3-us-west-2.amazonaws.com/oadoi-for-clarivate/{}.gz ***".format(filename))

    conn.close()
def export_no_versions(do_all=False, job_type="normal", filename=None, view="export_main_no_versions", week=False, json=False): (conn, ssh_client) = login_to_aws() logger.info(u"log in done") today = datetime.datetime.utcnow() if week: last_week = today - datetime.timedelta(days=9) if json: view = "pub where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format( last_week.isoformat()[0:19]) filename = "changed_dois_{}_to_{}.jsonl".format( last_week.isoformat()[0:19], today.isoformat()[0:19]).replace(":", "") else: view = "export_main_changed_no_versions where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format( last_week.isoformat()[0:19]) filename = "changed_dois_{}_to_{}.csv".format( last_week.isoformat()[0:19], today.isoformat()[0:19]).replace(":", "") else: if json: filename = "full_dois_{}.jsonl".format( today.isoformat()[0:19]).replace(":", "") else: filename = "full_dois_{}.csv".format( today.isoformat()[0:19]).replace(":", "") if json: command = """psql {}?ssl=true -c "\copy (select response_jsonb from {}) to '{}';" """.format( os.getenv("DATABASE_URL"), view, filename) else: command = """psql {}?ssl=true -c "\copy (select * from {}) to '{}' WITH (FORMAT CSV, HEADER);" """.format( os.getenv("DATABASE_URL"), view, filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) if json: command = """sed -i 's/"publishedVersion"/null/g; s/"submittedVersion"/null/g; s/"acceptedVersion"/null/g' {}""".format( filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) command = """gzip -c {} > {}.gz; date;""".format(filename, filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) command = """aws s3 cp {}.gz s3://unpaywall-data-updates/{}.gz --acl public-read; date; """.format( filename, filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) # also make a .DONE file # how to calculate a checksum http://www.heatware.net/linux-unix/how-to-create-md5-checksums-and-validate-a-file-in-linux/ command = """md5sum {}.gz > {}.gz.DONE; date;""".format(filename, filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) # copy up the .DONE file command = """aws s3 cp {}.gz.DONE s3://unpaywall-data-updates/{}.gz.DONE --acl public-read; date;""".format( filename, filename) logger.info(command) status, stdout, stderr = ssh_client.run(command) logger.info(u"{} {} {}".format(status, stdout, stderr)) logger.info( u"now go to *** https://console.aws.amazon.com/s3/object/unpaywall-data-updates/{}.gz?region=us-east-1&tab=overview ***" .format(filename)) logger.info( u"public link is at *** https://s3-us-west-2.amazonaws.com/unpaywall-data-updates/{}.gz ***" .format(filename)) conn.close()
def on_connect(client, userdata, flag, rc):
    print("Connected with result code " + str(rc))
    logger.info("Connected with result code " + str(rc))
    client.subscribe("Modbus\Received")
    Mqtt_Stat.value = rc
def subscribe(self):
    new_client = SSEClient(self)
    self.clients.append(new_client)
    logger.info('new client subscribed: {}'.format(new_client))
    return new_client
def print_ip():
    user_agent = request.headers.get('User-Agent')
    logger.info(u"calling from IP {ip}. User-Agent is '{user_agent}'.".format(
        ip=get_ip(), user_agent=user_agent))
def fetch_queue_chunk(self, chunk_size, scrape_publisher): logger.info(u"looking for new jobs") if scrape_publisher: pmh_value_filter = "and pmh_id = '{}'".format(publisher_equivalent_pmh_id) else: pmh_value_filter = "and pmh_id is distinct from '{}'".format(publisher_equivalent_pmh_id) text_query_pattern = """ with update_chunk as ( select lru_by_endpoint.id from endpoint e cross join lateral ( select qt.* from {queue_table} qt join page_new p using (id) where qt.endpoint_id = e.id and qt.started is null and (qt.finished is null or qt.finished < now() - '1 day'::interval) and qt.endpoint_id is distinct from '{biorxiv_id}' {pmh_value_filter} order by qt.finished asc nulls first limit 1 for update of qt skip locked ) lru_by_endpoint order by finished asc nulls first, rand limit {chunk_size} ) update {queue_table} queue_rows_to_update set started=now() from update_chunk where update_chunk.id = queue_rows_to_update.id returning update_chunk.id; """ text_query = text_query_pattern.format( chunk_size=chunk_size, queue_table=self.table_name(None), pmh_value_filter=pmh_value_filter, biorxiv_id=biorxiv_endpoint_id ) logger.info(u"the queue query is:\n{}".format(text_query)) job_time = time() row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall() object_ids = [row[0] for row in row_list] logger.info(u"got {} ids, took {} seconds".format(len(object_ids), elapsed(job_time))) job_time = time() q = db.session.query(PageNew).options( orm.undefer('*') ).filter(PageNew.id.in_(object_ids)) objects = q.all() logger.info(u"got page_new objects in {} seconds".format(elapsed(job_time))) return objects
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000): # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service headers = { "Accept": "application/json", "User-Agent": "mailto:[email protected]" } if first: base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}" else: base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}" next_cursor = "*" has_more_responses = True dois_from_api = [] number_added = 0 start_time = time() while has_more_responses: has_more_responses = False start_time = time() url = base_url.format(first=first, last=last, rows=chunk_size, next_cursor=next_cursor) logger.info(u"calling url: {}".format(url)) resp = requests.get(url, headers=headers) logger.info( u"getting crossref response took {} seconds. url: {}".format( elapsed(start_time, 2), url)) if resp.status_code != 200: logger.info(u"error in crossref call, status_code = {}".format( resp.status_code)) return number_added resp_data = resp.json()["message"] next_cursor = resp_data.get("next-cursor", None) if next_cursor: next_cursor = quote(next_cursor) if resp_data["items"] and len(resp_data["items"]) == chunk_size: has_more_responses = True dois_from_api = [ clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"] ] added_pubs = add_new_pubs_from_dois(dois_from_api) if dois_from_api: logger.info(u"got {} dois from api".format(len(dois_from_api))) if added_pubs: logger.info(u"{}: saved {} new pubs, including {}".format( first, len(added_pubs), added_pubs[-2:])) number_added += len(added_pubs) logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2))) return number_added
async def connect(self):
    logger.info("before connect")
    self.fast_mqtt.init_app(self.app)
    logger.info("after connect")
def publish(self, topic, message):
    logger.info(f'publishing to {topic}. message: {message}')
    return self.fast_mqtt.publish(topic, message)
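# A minimal sketch of how the `self.app` and `self.fast_mqtt` attributes used
# by the two methods above might be set up, assuming the fastapi-mqtt package.
# The class name `MqttService` and the broker settings are illustrative
# assumptions, not names taken from this codebase.
from fastapi import FastAPI
from fastapi_mqtt import FastMQTT, MQTTConfig


class MqttService:
    def __init__(self, app: FastAPI, host: str = 'localhost', port: int = 1883):
        # FastMQTT wraps an asynchronous MQTT client; MQTTConfig carries the
        # broker connection settings.
        self.app = app
        self.fast_mqtt = FastMQTT(config=MQTTConfig(host=host, port=port))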
def set_version_and_license(self, r=None):
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = self.default_version()

    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
        ur"<dc:type>{}</dc:type>".format(self.scrape_version),
        re.IGNORECASE | re.MULTILINE | re.DOTALL
    ).findall(self.pmh_record.api_raw)

    if version_is_from_strict_metadata or not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)

        if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        if text and self.scrape_version != 'acceptedVersion':
            patterns = [
                re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                    self.scrape_version = "acceptedVersion"

            if r and r.url and '61RMIT_INST' in r.url:
                if 'Version: Accepted' in text:
                    logger.info(u'found Version: Accepted, decided PDF is accepted version')
                    self.scrape_version = "acceptedVersion"

            heading_text = text[0:50].lower()
            accepted_headings = [
                "final accepted version",
                "accepted manuscript",
            ]
            for heading in accepted_headings:
                if heading in heading_text:
                    logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                    self.scrape_version = "acceptedVersion"
                    break

        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license

    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    if self.pmh_record:
        self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(
        self.url, self.scrape_version, self.scrape_license))
def print_status(self, job_type):
    num_dois = self.number_total_on_queue(job_type)
    num_waiting = self.number_waiting_on_queue(job_type)
    if num_dois:
        logger.info(u"There are {} dois in the queue, of which {} ({}%) are waiting to run".format(
            num_dois, num_waiting, int(100*float(num_waiting)/num_dois)))
def monitor_till_done(self, job_type): logger.info(u"collecting data. will have some stats soon...") logger.info(u"\n\n") num_total = self.number_total_on_queue(job_type) print "num_total", num_total num_unfinished = self.number_unfinished(job_type) print "num_unfinished", num_unfinished loop_thresholds = {"short": 30, "long": 10 * 60, "medium": 60} loop_unfinished = {"short": num_unfinished, "long": num_unfinished} loop_start_time = {"short": time(), "long": time()} # print_idle_dynos(job_type) while all(loop_unfinished.values()): for loop in ["short", "long"]: if elapsed(loop_start_time[loop]) > loop_thresholds[loop]: if loop in ["short", "long"]: num_unfinished_now = self.number_unfinished(job_type) num_finished_this_loop = loop_unfinished[ loop] - num_unfinished_now loop_unfinished[loop] = num_unfinished_now if loop == "long": logger.info(u"\n****"), logger.info( u" {} finished in the last {} seconds, {} of {} are now finished ({}%). " .format( num_finished_this_loop, loop_thresholds[loop], num_total - num_unfinished_now, num_total, int(100 * float(num_total - num_unfinished_now) / num_total)) ), # comma so the next part will stay on the same line if num_finished_this_loop: minutes_left = float( num_unfinished_now ) / num_finished_this_loop * loop_thresholds[ loop] / 60 logger.info( u"{} estimate: done in {} mins, which is {} hours" .format(loop, round(minutes_left, 1), round(minutes_left / 60, 1))) else: print loop_start_time[loop] = time() # print_idle_dynos(job_type) print ".", sleep(3) logger.info(u"everything is done. turning off all the dynos") self.scale_dyno(0, job_type)
scale_dyno(0, job_type) truncate(job_type) add_dois_to_queue_from_file(parsed_args.filename, job_type) if parsed_args.addall or parsed_args.where: if num_dynos(job_type) > 0: scale_dyno(0, job_type) add_dois_to_queue_from_query(parsed_args.where, job_type) if parsed_args.soup: if num_dynos(job_type) > 0: scale_dyno(0, job_type) if parsed_args.dynos: scale_dyno(parsed_args.dynos, job_type) else: logger.info(u"no number of dynos specified, so setting 1") scale_dyno(1, job_type) monitor_till_done(job_type) scale_dyno(0, job_type) export_with_versions(parsed_args.all, job_type, parsed_args.filename, parsed_args.view) else: if parsed_args.dynos != None: # to tell the difference from setting to 0 scale_dyno(parsed_args.dynos, job_type) # if parsed_args.dynos > 0: # print_logs(job_type) if parsed_args.reset: reset_enqueued(job_type) if parsed_args.status:
def scrape_for_fulltext_link(self): url = self.url dont_scrape_list = [ u"ncbi.nlm.nih.gov", u"pubmed", u"elar.rsvpu.ru", #these ones based on complaint in email u"elib.uraic.ru", u"elar.usfeu.ru", u"elar.urfu.ru", u"elar.uspu.ru" ] for url_fragment in dont_scrape_list: if url_fragment in url: logger.info( u"not scraping {} because is on our do not scrape list.". format(url)) return try: with closing( http_get(url, stream=True, related_pub=self.related_pub, ask_slowly=self.ask_slowly)) as self.r: if self.r.status_code != 200: self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format( self.r.status_code, url) return # if our url redirects to a pdf, we're done. # = open repo http://hdl.handle.net/2060/20140010374 if self.is_a_pdf_page(): if DEBUG_SCRAPING: logger.info( u"this is a PDF. success! [{}]".format(url)) self.scraped_pdf_url = url return else: if DEBUG_SCRAPING: logger.info( u"is not a PDF for {}. continuing more checks". format(url)) # now before reading the content, bail it too large if is_response_too_large(self.r): logger.info(u"landing page is too large, skipping") return # get the HTML tree page = self.r.content # set the license if we can find one scraped_license = find_normalized_license(page) if scraped_license: self.scraped_license = scraped_license # special exception for citeseer because we want the pdf link where # the copy is on the third party repo, not the cached link, if we can get it if u"citeseerx.ist.psu.edu/" in url: matches = re.findall( u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL) if matches: self.scraped_pdf_url = unicode(matches[0], "utf-8") self.scraped_open_metadata_url = url return pdf_download_link = self.find_pdf_link(page) if pdf_download_link is not None: if DEBUG_SCRAPING: logger.info( u"found a PDF download link: {} {} [{}]".format( pdf_download_link.href, pdf_download_link.anchor, url)) pdf_url = get_link_target(pdf_download_link.href, self.r.url) # if they are linking to a PDF, we need to follow the link to make sure it's legit if DEBUG_SCRAPING: logger.info( u"checking to see the PDF link actually gets a PDF [{}]" .format(url)) if self.gets_a_pdf(pdf_download_link, self.r.url): self.scraped_pdf_url = pdf_url self.scraped_open_metadata_url = url return # try this later because would rather get a pdfs # if they are linking to a .docx or similar, this is open. 
doc_link = find_doc_download_link(page) if doc_link is not None: if DEBUG_SCRAPING: logger.info( u"found a .doc download link {} [{}]".format( get_link_target(doc_link.href, self.r.url), url)) self.scraped_open_metadata_url = url return except requests.exceptions.ConnectionError as e: self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return except requests.Timeout as e: self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return except requests.exceptions.InvalidSchema as e: self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return except requests.exceptions.RequestException as e: self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return except requests.exceptions.ChunkedEncodingError as e: self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return except NoDoiException as e: self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format( url, unicode(e.message).encode("utf-8")) logger.info(self.error) return if DEBUG_SCRAPING: logger.info( u"found no PDF download link. end of the line. [{}]".format( url)) return self
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000): i = 0 records_to_save = [] # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service headers = { "Accept": "application/json", "User-Agent": "mailto:[email protected]" } root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}" root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}" root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}" # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}" # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}" next_cursor = "*" has_more_responses = True num_pubs_added_so_far = 0 pubs_this_chunk = [] if week: last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat() first = (datetime.date.today() - datetime.timedelta(days=7)).isoformat() elif today: last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat() first = (datetime.date.today() - datetime.timedelta(days=2)).isoformat() if not first: first = "2016-04-01" start_time = time() while has_more_responses: if query_doi: url = root_url_doi.format(doi=query_doi) else: if last: url = root_url_with_last.format(first=first, last=last, next_cursor=next_cursor, chunk=chunk_size) else: # query is much faster if don't have a last specified, even if it is far in the future url = root_url_no_last.format(first=first, next_cursor=next_cursor, chunk=chunk_size) logger.info(u"calling url: {}".format(url)) crossref_time = time() resp = requests.get(url, headers=headers) logger.info(u"getting crossref response took {} seconds".format( elapsed(crossref_time, 2))) if resp.status_code != 200: logger.info(u"error in crossref call, status_code = {}".format( resp.status_code)) resp = None if resp: resp_data = resp.json()["message"] next_cursor = resp_data.get("next-cursor", None) if next_cursor: next_cursor = quote(next_cursor) if not resp_data["items"] or not next_cursor: has_more_responses = False for api_raw in resp_data["items"]: loop_time = time() doi = clean_doi(api_raw["DOI"]) my_pub = build_new_pub(doi, api_raw) # hack so it gets updated soon my_pub.updated = datetime.datetime(1042, 1, 1) pubs_this_chunk.append(my_pub) if len(pubs_this_chunk) >= 100: added_pubs = add_new_pubs(pubs_this_chunk) logger.info( u"added {} pubs, loop done in {} seconds".format( len(added_pubs), elapsed(loop_time, 2))) num_pubs_added_so_far += len(added_pubs) # if new_pubs: # id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]] # logger.info(u"last few ids were {}".format(id_links)) pubs_this_chunk = [] loop_time = time() logger.info(u"at bottom of loop") # make sure to get the last ones logger.info(u"saving last ones") added_pubs = add_new_pubs(pubs_this_chunk) num_pubs_added_so_far += len(added_pubs) logger.info( u"Added >>{}<< new crossref dois on {}, took {} seconds".format( num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], 
elapsed(start_time, 2)))
def gets_a_pdf(self, link, base_url): if is_purchase_link(link): return False absolute_url = get_link_target(link.href, base_url) if DEBUG_SCRAPING: logger.info(u"checking to see if {} is a pdf".format(absolute_url)) start = time() try: with closing( http_get(absolute_url, stream=True, related_pub=self.related_pub, ask_slowly=self.ask_slowly)) as self.r: if self.r.status_code != 200: self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format( self.r.status_code, absolute_url) return False if self.is_a_pdf_page(): return True except requests.exceptions.ConnectionError as e: self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) except requests.Timeout as e: self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) except requests.exceptions.InvalidSchema as e: self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) except requests.exceptions.RequestException as e: self.error += u"ERROR: RequestException error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) except requests.exceptions.ChunkedEncodingError as e: self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) except NoDoiException as e: self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format( absolute_url, unicode(e.message).encode("utf-8")) logger.info(self.error) if DEBUG_SCRAPING: logger.info( u"we've decided this ain't a PDF. took {} seconds [{}]".format( elapsed(start), absolute_url)) return False
type=str, help="last filename to process (example: --last 2006-01-01)") parser.add_argument('--query_doi', nargs="?", type=str, help="pull in one doi") parser.add_argument( '--today', action="store_true", default=False, help="use if you want to pull in crossref records from last 2 days") parser.add_argument( '--week', action="store_true", default=False, help="use if you want to pull in crossref records from last 7 days") parser.add_argument('--chunk_size', nargs="?", type=int, default=1000, help="how many docs to put in each POST request") parsed = parser.parse_args() logger.info(u"calling {} with these args: {}".format( function.__name__, vars(parsed))) function(**vars(parsed))
def Mqtt_process(Stat, MqConn, MqStatChild, MqDataChild):
    mq = mqtt_parameters.query.get(1)
    try:
        client = mqtt.Client(client_id="Proj_%s" % (random.getrandbits(8)))
        client.on_connect = on_connect
        # Mqtt_Stat = client._handle_connack()
        if mq.mqtt_user_name and mq.mqtt_password:
            client.username_pw_set(mq.mqtt_user_name, mq.mqtt_password)
        elif mq.mqtt_access_token:
            client.username_pw_set(mq.mqtt_access_token)
        client.connect(mq.mqtt_ip, mq.mqtt_port, 60)
        MqStatChild.send("Set client parameters")
        logger.info("Set client parameters")
        client.loop_start()
        MqStatChild.send("Loop started & connected to server")
        logger.info("Loop started & connected to server")

        Mqtt_Stat = Stat.value
        while (Mqtt_Stat > 0):
            time.sleep(1)
            if Mqtt_Stat == 0:
                pass
            elif Mqtt_Stat == 1:
                # --- Connection refused - incorrect protocol version --- #
                client.loop_stop()
                MqStatChild.send("Connection refused - incorrect protocol version")
                logger.error("Connection refused - incorrect protocol version")
            elif Mqtt_Stat == 2:
                # --- Connection refused - invalid client identifier --- #
                MqStatChild.send("Connection refused - invalid client identifier")
                logger.error("Connection refused - invalid client identifier")
                client.loop_stop()
                time.sleep(1)
                client = mqtt.Client(client_id="Proj_%s" % random.getrandbits(8))
                MqStatChild.send("Changed to another client identifier")
                logger.info("Changed to another client identifier")
                client.loop_start()
                MqStatChild.send("Loop started")
                logger.info("Loop started")
            elif Mqtt_Stat == 3:
                # --- Connection refused - server unavailable --- #
                client.loop_stop()
                MqStatChild.send("Connection unavailable, check internet")
                logger.error("Connection unavailable, check internet")
            elif Mqtt_Stat == 4:
                # --- Connection refused - bad username or password --- #
                client.loop_stop()
                MqStatChild.send("Connection refused - bad username or password")
                logger.error("Connection refused - bad username or password")
            elif Mqtt_Stat == 5:
                # --- Connection refused - not authorised --- #
                client.loop_stop()
                MqStatChild.send("Connection refused - not authorised")
                logger.error("Connection refused - not authorised")
            else:
                MqStatChild.send("Waiting for connection / not connected --> Mqtt_Stat - %s" % Mqtt_Stat)
                logger.info("Waiting for connection / not connected --> Mqtt_Stat - %s" % Mqtt_Stat)

        while True:
            if MqConn.poll():
                msg = MqConn.recv()
                client.publish(msg[0].topic, payload=msg[1], qos=msg[0].qos, retain=msg[0].retain)
                # client.publish(msg["topic"], msg["value"])
                MqDataChild.send(msg)
    except Exception as e:
        client.loop_stop()
        print("Mqtt error - {}".format(e))
        MqStatChild.send("Mqtt Disconnected, mqtt Process Stopped")
        MqStatChild.send(str(e))
        logger.exception("Got Exception")
def worker_run(self, **kwargs): run_class = PageNew single_id = kwargs.get("id", None) chunk_size = kwargs.get("chunk", 100) limit = kwargs.get("limit", None) scrape_publisher = kwargs.get("scrape_publisher", False) if limit is None: limit = float("inf") if single_id: page = run_class.query.filter(run_class.id == single_id).first() page.scrape() db.session.merge(page) safe_commit(db) or logger.info(u"COMMIT fail") else: index = 0 num_updated = 0 start_time = time() while num_updated < limit: new_loop_start_time = time() objects = self.fetch_queue_chunk(chunk_size, scrape_publisher) if not objects: sleep(5) continue scraped_ids = scrape_pages(objects) unscraped_ids = [obj.id for obj in objects if obj.id not in scraped_ids] logger.info(u'scraped {} pages and returned {} to the queue'.format( len(scraped_ids), len(unscraped_ids) )) scraped_batch_text = u''' update {queue_table} set finished = now(), started=null where id = any(:ids)'''.format(queue_table=self.table_name(None)) unscraped_batch_text = u''' update {queue_table} set started=null where id = any(:ids)'''.format(queue_table=self.table_name(None)) scraped_batch_command = text(scraped_batch_text).bindparams( ids=scraped_ids) unscraped_batch_command = text(unscraped_batch_text).bindparams( ids=unscraped_ids) db.session.execute(scraped_batch_command) db.session.execute(unscraped_batch_command) commit_start_time = time() safe_commit(db) or logger.info(u"COMMIT fail") logger.info(u"commit took {} seconds".format(elapsed(commit_start_time, 2))) index += 1 num_updated += chunk_size self.print_update(new_loop_start_time, len(scraped_ids), limit, start_time, index)
def Mod_ReadWrite(ModConn, ModStatChild): mod = modbus_parameters.query.get(1) PubTopics = pub_mqtt_topics.query.filter(pub_mqtt_topics.mod_addresses.any(read_mod_registers.address >= 0)).all() try: while True: if is_connected(mod.modbus_ip,mod.modbus_port): ModStatChild.send("Modbus device Connection is UP") logger.info("Modbus device Connection is UP") break else: ModStatChild.send("Modbus device connection is DOWN") logger.error("Modbus device connection is DOWN") time.sleep(10) Modclient = ModbusClient(mod.modbus_ip, port = mod.modbus_port) msg = 0 while True : if ModStatChild.poll(): msg = ModStatChild.recv() print("received Msg in modbus outer while loop {}".format(msg)) logger.info("received Msg in modbus outer while loop {}".format(msg)) if msg == 1: Modclient.connect() ModStatChild.send("Connected to Modbus device") logger.info("Connected to Modbus device") msg = 2 while msg == 2: # GetModValues = ModReadJson(Modclient, 0 , 10) for topic in PubTopics: GetModValues = ModReadTopic(Modclient,topic) ######################### ModConn.send(GetModValues) ######################### # print(GetModValues) time.sleep(0.5) if ModStatChild.poll(): msg = ModStatChild.recv() if msg == 1 : print("received Msg in modbus inner while loop {}".format(msg)) logger.info("received Msg in modbus inner while loop {}".format(msg)) msg = 2 ModStatChild.send("Modbus device data Acquisition already running") while msg == 3 : Modclient.close() msg = 0 ModStatChild.send("Modbus device connection Closed") # FlModChild.send("Disconnected from Controller") # FlModChild.send(GetModValues) # if ModConn.poll(): # msg = ModConn.recv() # if isinstance(msg, dict): # if "ModWrite" in msg: # if msg["ModWrite"] == True: # ModWriteJson(ModbusClient,msg) except Exception as e: ModStatChild.send("Modbus Disconnected, Modbus process Stopped") ModStatChild.send(str(e)) print(e) ############------------------------------------------------------################
def stop(self):
    self.stopped = True
    self.status = 'stopping'
    if hasattr(self.job, 'stop'):
        self.status = self.job.stop()
    logger.info('sequence job stopped: status={}'.format(self.status))
def unsubscribe(self, client):
    logger.info('unsubscribing client: {} (clients: {})'.format(client, len(self.clients)))
    self.clients = [x for x in self.clients if x != client]
    logger.debug('clients now: {}'.format(len(self.clients)))
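# A minimal sketch of how subscribe/unsubscribe above might be driven from a
# Flask streaming endpoint. The queue-backed `events()` generator on SSEClient
# is a hypothetical interface assumed for illustration; the real SSEClient in
# this codebase may expose something different.
from flask import Response


def sse_stream(hub):
    # `hub` is the object holding `clients`, `subscribe()` and `unsubscribe()`.
    client = hub.subscribe()

    def event_stream():
        try:
            for event in client.events():  # hypothetical generator yielding SSE-formatted strings
                yield event
        finally:
            # drop the client once the browser disconnects
            hub.unsubscribe(client)

    return Response(event_stream(), mimetype='text/event-stream')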
def get_geojson_carto_gs(self, filepath: tuple = (), opts: dict = {}) -> dict:
    # [Step - 1] Get image + check color sampling
    img_path = os.path.join(BASE_DIR, filepath[0])
    img_tiff_path = os.path.join(BASE_DIR, filepath[1])
    img_extension = os.path.splitext(img_path)[1]
    img_name = ntpath.basename(img_path).replace(img_extension, '')
    img_base_path = img_path.replace(ntpath.basename(img_path), '')
    color_preset = self.data['color_presets'][self.options['color_preset']]
    logger.info('Color Preset (Carto Grayscale): ', {'color_preset': color_preset})
    do_contour_normalization = bool(
        color_preset['building']['normalize_contours']
    ) if 'normalize_contours' in color_preset['building'] else False

    image = cv2.imread(img_path, 1)
    fc_bgr_building_gray = color_preset['building']['fill']['gray']
    fc_hsv_building_gray = bgr_color_to_hsv(fc_bgr_building_gray)
    if color_preset['building']['border']['type'] == 'relative':
        fc_hsv_building_gray_darker = self.transform_relative_color(
            fc_hsv_building_gray,
            color_preset['building']['border']['value']['gray'])
    else:
        fc_hsv_building_gray_darker = self.transform_color_string_to_float(
            color_preset['building']['border']['value']['gray'])
    logger.debug(
        self.logger_base_text + 'Color Info', {
            'fill_color_bgr': {'gray': fc_bgr_building_gray},
            'fill_color_hsv': {'gray': fc_hsv_building_gray},
            'border_color_hsv': {'gray': fc_hsv_building_gray_darker}
        })

    # [Step - 2] Do masking on HSV image
    img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
    mask_gray = cv2.inRange(hsv, fc_hsv_building_gray, fc_hsv_building_gray_darker)
    final = cv2.bitwise_or(image, image, mask=mask_gray)

    # [Step - 3] Find contours
    json_contour_filepath = self.data['file']['json_contour'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'carto-gs')
    json_contour_debug_filepath = self.data['file']['json_contour_debug'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'carto-gs')
    geojson_filepath = self.data['file']['geojson'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'carto-gs')
    final_gray = cv2.cvtColor(final, cv2.COLOR_BGR2GRAY)
    final_blurred = cv2.GaussianBlur(final_gray, (3, 3), 0)
    ret, final_thresh = cv2.threshold(final_blurred, 127, 255, 0)
    contours, hierarchy = cv2.findContours(final_thresh, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)

    # contour normalization
    if do_contour_normalization:
        contours = self.normalize_contours(contours)

    ctr_json_str = json.dumps(
        {'contours': contours, 'hierarchy': hierarchy},
        default=json_np_default_parser)
    ctr_json = json.loads(ctr_json_str)
    ctr_points = []
    for cidx in range(len(ctr_json['contours'])):
        ctr_points.append(
            list(map(lambda x: x[0], ctr_json['contours'][cidx])))

    # [Step - 4] Find contours' geographic coordinates
    geotiff_image = img_path.replace(img_extension, '.tif')
    translate_coords = GeoTiffProcessor.get_multi_polygon_axis_point_coordinates(
        geotiff_image, ctr_points, {'debug': False})
    final_coords = []
    geo_features = []
    for poly in translate_coords['coords']:
        poly_coords = []
        poly_geo_coords = []
        for cr in poly:
            poly_coords.append({
                'x': cr['x'],
                'y': cr['y'],
                'latitude': cr['lat'],
                'longitude': cr['long']
            })
            poly_geo_coords.append((cr['long'], cr['lat']))
        # add final closing point
        poly_geo_coords.append((poly[0]['long'], poly[0]['lat']))
        final_coords.append(poly_coords)
        geo_feature = Feature(geometry=Polygon([poly_geo_coords], precision=15))
        geo_features.append(geo_feature)

    geo_feature_collection = FeatureCollection(geo_features)
    geo_feature_collection_dump = geojson_dumps(geo_feature_collection, sort_keys=True)

    with open(json_contour_filepath, 'w') as outfile:
        json.dump(final_coords, outfile)
    with open(geojson_filepath, 'w') as outfile:
        outfile.write(geo_feature_collection_dump)

    # [Step - 5] Draw contours on a clone of the original image
    final_wctrs = copy(image)
    for c in contours:
        cv2.drawContours(final_wctrs, [c], 0, color_preset['building']['contour'], 2)

    # Build result
    polygon_len = len(ctr_points)
    r = {
        'file_path': geojson_filepath,
        'file_size': str(get_file_size(geojson_filepath, SIZE_UNIT.KB)) + ' KB',
        'polygon_total': polygon_len
    }
    if 'return_polygon_data' in opts and bool(opts['return_polygon_data']):
        r['geojson'] = json.loads(geo_feature_collection_dump)

    if self.options['save_result']:
        result_ftemplate = self.data['path']['result'] + img_name + '-carto-gs-<fnm>' + img_extension
        self.write_image_results(
            result_ftemplate, '<fnm>',
            [('step-1-2-hsv-building-gray', fc_hsv_building_gray),
             ('step-2-image-bgr', image),
             ('step-3-image-rgb', img_rgb),
             ('step-4-0-hsv', hsv),
             ('step-4-1-hsv-mask-gray', mask_gray),
             ('step-5-final', final),
             ('step-6-image-gray', final_gray),
             ('step-7-final-blurred', final_blurred),
             ('step-8-final-thresh', final_thresh),
             ('step-9-image-final-with-contours', final_wctrs)])

    if self.options['show_result']:
        show_image_results([
            ("Step - 1-1 (HSV Gray Color)", np.uint8([[fc_hsv_building_gray]])),
            ("Step - 2 (Image - BGR)", image),
            ("Step - 3 (Image - RGB)", img_rgb),
            ("Step - 4-0 (HSV)", hsv),
            ("Step - 4-1 (HSV - Gray)", mask_gray),
            ("Step - 5 (Final)", final),
            ("Step - 6 (Final - Gray)", final_gray),
            ("Step - 7 (Final - Gray Blurred)", final_blurred),
            ("Step - 8 (Final - Gray Thresh)", final_thresh),
            ("Step - 9 (Final - with contours)", final_wctrs)
        ])

    # [Step - ending] Clean-up
    del contours, hierarchy, image, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, mask_gray, fc_hsv_building_gray
    return r
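# Illustrative sketch (not part of the class above): the heart of the carto-gs
# pipeline is an HSV range mask followed by external-contour extraction. The
# helper below is a hypothetical, self-contained reduction of Steps 2-3; the
# HSV bounds and the sample file name are placeholder assumptions, not values
# taken from the presets used above.
import cv2
import numpy as np


def mask_to_pixel_polygons(image_bgr, hsv_lower, hsv_upper):
    """Return one list of (x, y) pixel points per external contour found
    inside the given HSV range."""
    hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(hsv_lower), np.array(hsv_upper))
    masked = cv2.bitwise_and(image_bgr, image_bgr, mask=mask)
    gray = cv2.cvtColor(masked, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    _, thresh = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    # each contour is an (N, 1, 2) array; flatten to plain (x, y) tuples
    return [[tuple(pt[0]) for pt in c] for c in contours]


# Usage (assumed input file):
#   polygons = mask_to_pixel_polygons(cv2.imread('sample-tile.png', 1),
#                                     (0, 0, 120), (180, 30, 220))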
def run(self, server, devices, root_path, event_listener, on_update, index):
    filename_template_params = {
        'timestamp': lambda _: time.time(),
        'datetime': lambda _: time.strftime('%Y-%m-%dT%H:%M:%S-%Z'),
        'filter': 'no-filter',
        'filter_index': -1,
    }
    if 'filter_wheel' in devices and devices['filter_wheel']:
        filename_template_params['filter_index'], filename_template_params[
            'filter'] = devices['filter_wheel'].indi_sequence_filter_wheel(
            ).current_filter()

    upload_path = os.path.join(root_path, self.directory)
    self.save_directory = upload_path
    self.job_runner = ExposureSequenceJobRunner(
        server,
        devices['camera'].indi_sequence_camera(),
        self.exposure,
        self.count,
        upload_path,
        progress=self.progress,
        filename_template=self.filename,
        filename_template_params=filename_template_params,
        shots_pause=self.shots_pause,
        shots_group=self.shots_group,
        shots_group_pause=self.shots_group_pause)

    def on_started(job_runner):
        pass

    def on_each_started(job_runner, index):
        self.last_message = 'starting exposure {} out of {}'.format(
            index + 1, job_runner.count)
        on_update()

    def on_each_finished(job_runner, index, filename):
        self.last_message = 'finished exposure {} out of {}, saved to {}'.format(
            index + 1, job_runner.count, filename)
        on_update()

    def on_each_saved(job_runner, index, filename):
        logger.info('received file for index {}: {}'.format(index, filename))
        image = Image(path=filename, file_required=True)
        self.progress = job_runner.finished
        self.saved_images.append(image.id)
        main_images_db.add(image)
        on_update()

    def on_finished(job_runner):
        self.last_message = 'finished.'
        on_update()
        self.progress = job_runner.finished
        self.job_runner = None

    logger.info('Starting job runner: {}, upload_path={}'.format(
        self.job_runner, upload_path))
    self.job_runner.callbacks.add('on_started', on_started)
    self.job_runner.callbacks.add('on_each_started', on_each_started)
    self.job_runner.callbacks.add('on_each_finished', on_each_finished)
    self.job_runner.callbacks.add('on_each_saved', on_each_saved)
    self.job_runner.callbacks.add('on_finished', on_finished)
    try:
        self.job_runner.run()
    except:
        if self.job_runner:
            self.progress = self.job_runner.finished
        logger.warning('Error running exposures job')
        raise
    finally:
        self.job_runner = None
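# Illustrative sketch: the job runner above is driven entirely through named
# callbacks ('on_started', 'on_each_saved', ...). A minimal registry compatible
# with the `callbacks.add(name, handler)` usage seen here might look like the
# class below; the class name and the `fire` method are assumptions made for
# illustration, not the actual ExposureSequenceJobRunner API.
from collections import defaultdict


class CallbackRegistry:
    def __init__(self):
        self._handlers = defaultdict(list)

    def add(self, name, handler):
        # register a handler under an event name; multiple handlers are allowed
        self._handlers[name].append(handler)

    def fire(self, name, *args, **kwargs):
        # invoke every handler registered for this event, in registration order
        for handler in self._handlers[name]:
            handler(*args, **kwargs)


# e.g. inside a runner: self.callbacks.fire('on_each_saved', self, index, filename)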
async def initialize(self):
    logger.info('Initializing MQTT connection')
    self.client.fast_mqtt.user_connect_handler = MQTTEventPublisher.on_connect
    self.client.fast_mqtt.client.on_disconnect = MQTTEventPublisher.on_disconnect
    await self.client.connect()
def get_geojson_osm(self, filepath: tuple = (), opts: dict = {}) -> dict:
    # [Step - 1] Get image + check color sampling
    img_path = os.path.join(BASE_DIR, filepath[0])
    img_tiff_path = os.path.join(BASE_DIR, filepath[1])
    img_extension = os.path.splitext(img_path)[1]
    img_name = ntpath.basename(img_path).replace(img_extension, '')
    img_base_path = img_path.replace(ntpath.basename(img_path), '')
    color_preset = self.data['color_presets'][self.options['color_preset']]
    logger.info('Color Preset (OSM): ', {'color_preset': color_preset})
    do_contour_normalization = bool(
        color_preset['building']['normalize_contours']
    ) if 'normalize_contours' in color_preset['building'] else False

    image_origin = cv2.imread(img_path, 1)
    if 'sharp_image' in color_preset['building']:
        sharp_img = self.unsharp_mask(
            image_origin, **color_preset['building']['sharp_image'])
        image_origin = copy(image_origin)

    image_new_contrast = []
    if 'adjust_contrast' in color_preset['building']:
        image = cv2.convertScaleAbs(
            image_origin,
            alpha=color_preset['building']['adjust_contrast']['alpha'],
            beta=color_preset['building']['adjust_contrast']['beta'])
        image_new_contrast = [
            cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-10),
            cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-20),
            cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-30),
            cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-50),
            cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-60)
        ]
    else:
        image = copy(image_origin)

    light_brown = np.uint8([[color_preset['building']['fill']]])

    # Enhance image (ref: https://chrisalbon.com/machine_learning/preprocessing_images/enhance_contrast_of_greyscale_image/)

    # Convert BGR to HSV for masking
    hsv_fill_color = cv2.cvtColor(light_brown, cv2.COLOR_BGR2HSV)
    color_codes = hsv_fill_color[0][0]

    # [Step - 2] Do masking on HSV image
    img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
    fill_color = (float(color_codes[0]), float(color_codes[1]),
                  float(color_codes[2]))

    find_border_color = []
    if color_preset['building']['border']['type'] == 'relative':
        temp = []
        for idx, bbv in enumerate(color_preset['building']['border']['value'], 0):
            if bbv[0] == '+':
                temp.append(float(color_codes[idx]) + float(bbv[1:]))
            elif bbv[0] == '-':
                temp.append(float(color_codes[idx]) - float(bbv[1:]))
            else:
                temp.append(float(bbv))
        border_color = tuple(temp)
    else:
        find_border_color = cv2.cvtColor(
            np.uint8([[color_preset['building']['border']['value']]]),
            cv2.COLOR_BGR2HSV)
        border_color = (float(find_border_color[0][0][0]),
                        float(find_border_color[0][0][1]),
                        float(find_border_color[0][0][2]))

    logger.debug(
        self.logger_base_text + 'Color Info', {
            'fill_color': fill_color,
            'border_color': border_color,
            'float_border_color': find_border_color,
            'hsv_fill_color_codes': color_codes,
            'hsv_fill_color': hsv_fill_color
        })

    mask = cv2.inRange(hsv, fill_color, border_color)
    final = cv2.bitwise_and(image, image, mask=mask)

    # [Step - 3] Find contours
    json_contour_filepath = self.data['file']['json_contour'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'osm')
    json_contour_debug_filepath = self.data['file']['json_contour_debug'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'osm')
    geojson_filepath = self.data['file']['geojson'].replace(
        '<result_path>', self.data['path']['result']).replace(
            '<img_name>', img_name).replace('<preset>', 'osm')
    final_gray = cv2.cvtColor(final, cv2.COLOR_BGR2GRAY)
    final_blurred = cv2.GaussianBlur(final_gray, (5, 5), 0)
    ret, final_thresh = cv2.threshold(final_blurred, 127, 255, 0)
    contours, hierarchy = cv2.findContours(final_thresh, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)

    # contour normalization
    if do_contour_normalization:
        contours = self.normalize_contours(contours)

    ctr_json_str = json.dumps(
        {'contours': contours, 'hierarchy': hierarchy},
        default=json_np_default_parser)
    ctr_json = json.loads(ctr_json_str)
    ctr_points = []
    for cidx in range(len(ctr_json['contours'])):
        ctr_points.append(
            list(map(lambda x: x[0], ctr_json['contours'][cidx])))

    # [Step - 4] Find contours' geographic coordinates
    geotiff_image = img_tiff_path
    translate_coords = GeoTiffProcessor.get_multi_polygon_axis_point_coordinates(
        geotiff_image, ctr_points, {'debug': False})
    final_coords = []
    geo_features = []
    for poly in translate_coords['coords']:
        poly_coords = []
        poly_geo_coords = []
        for cr in poly:
            poly_coords.append({
                'x': cr['x'],
                'y': cr['y'],
                'latitude': cr['lat'],
                'longitude': cr['long']
            })
            poly_geo_coords.append((cr['long'], cr['lat']))
        # add final closing point
        poly_geo_coords.append((poly[0]['long'], poly[0]['lat']))
        final_coords.append(poly_coords)
        geo_feature = Feature(geometry=Polygon([poly_geo_coords], precision=15))
        geo_features.append(geo_feature)

    geo_feature_collection = FeatureCollection(geo_features)
    geo_feature_collection_dump = geojson_dumps(geo_feature_collection, sort_keys=True)

    with open(json_contour_filepath, 'w') as outfile:
        json.dump(final_coords, outfile)
    with open(geojson_filepath, 'w') as outfile:
        outfile.write(geo_feature_collection_dump)

    # [Step - 5] Draw contours on a clone of the original image
    final_wctrs = copy(image)
    for c in contours:
        cv2.drawContours(final_wctrs, [c], 0, color_preset['building']['contour'], 2)

    # Build result
    polygon_len = len(ctr_points)
    r = {
        'file_path': geojson_filepath,
        'file_size': str(get_file_size(geojson_filepath, SIZE_UNIT.KB)) + ' KB',
        'polygon_total': polygon_len
    }
    if 'return_polygon_data' in opts and bool(opts['return_polygon_data']):
        r['geojson'] = json.loads(geo_feature_collection_dump)

    if self.options['save_result']:
        result_ftemplate = self.data['path']['result'] + img_name + '-<fnm>' + img_extension
        if 'sharp_image' in color_preset['building']:
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-sharpen-1'), sharp_img)
        if 'adjust_contrast' in color_preset['building']:
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-contrast-1'), image_new_contrast[0])
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-contrast-2'), image_new_contrast[1])
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-contrast-3'), image_new_contrast[2])
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-contrast-4'), image_new_contrast[3])
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-0-contrast-5'), image_new_contrast[4])
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-1-hsv-light-color'), hsv_fill_color)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-2-image-bgr'), image)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-3-image-rgb'), img_rgb)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-4-hsv'), hsv)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-5-final'), final)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-6-image-gray'), final_gray)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-7-final-blurred'), final_blurred)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-8-final-thresh'), final_thresh)
        cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-9-image-final-with-contours'), final_wctrs)

    if self.options['show_result']:
        cv2.imshow("Step - 1 (HSV Light Color)", hsv_fill_color)
        cv2.imshow("Step - 2 (Image - BGR)", image)
        cv2.imshow("Step - 3 (Image - RGB)", img_rgb)
        cv2.imshow("Step - 4 (HSV)", hsv)
        cv2.imshow("Step - 5 (Final)", final)
        cv2.imshow("Step - 6 (Final - Gray)", final_gray)
        cv2.imshow("Step - 7 (Final - Gray Blurred)", final_blurred)
        cv2.imshow("Step - 8 (Final - Gray Thresh)", final_thresh)
        cv2.imshow("Step - 9 (Final - with contours)", final_wctrs)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    # [Step - ending] Clean-up
    del contours, hierarchy, image, hsv_fill_color, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, ctr_json, ctr_json_str, final_coords, geo_features, ctr_points
    return r
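# Illustrative sketch: when a preset's border type is 'relative', each border
# value is a string offset such as '+10' or '-35' applied per HSV channel to
# the fill colour (see the loop over `bbv` above). A standalone version of that
# transform, written here as a hypothetical helper, could be:
def relative_border_color(fill_hsv, offsets):
    """fill_hsv: three channel values; offsets: strings like '+10', '-35',
    or an absolute value such as '200'."""
    border = []
    for channel, offset in zip(fill_hsv, offsets):
        if offset[0] == '+':
            border.append(float(channel) + float(offset[1:]))
        elif offset[0] == '-':
            border.append(float(channel) - float(offset[1:]))
        else:
            border.append(float(offset))
    return tuple(border)


# e.g. relative_border_color((20.0, 80.0, 230.0), ('+0', '+40', '-35'))
# -> (20.0, 120.0, 195.0)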
def __init__(self, mqtt_host: str, mqtt_port: int, mqtt_client_name: str, app: FastAPI):
    logger.info(f'simulating a client to {mqtt_host}')
    self.mqtt_client_name = mqtt_client_name
    self.mqtt_host = mqtt_host
    self.mqtt_port = mqtt_port
def publish(self, topic, message):
    logger.info(f'simulated publishing to {topic}. message: {message}')
async def publish_state(self, customer_state: CustomerState):
    message = self.prepare_payload(customer_state)
    logger.info(f'Publishing {message}')
def scrape_green(self):
    # handle these special cases, where we compute the pdf rather than looking for it
    if "oai:arXiv.org" in self.pmh_id:
        self.scrape_metadata_url = self.url
        self.scrape_pdf_url = self.url.replace("abs", "pdf")

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # https://ink.library.smu.edu.sg/do/oai/
    if self.endpoint and self.endpoint.id == 'ys9xnlw27yogrfsecedx' and u'ink.library.smu.edu.sg' in self.url:
        if u'viewcontent.cgi?' in self.url:
            return
        if self.pmh_record and find_normalized_license(self.pmh_record.license):
            self.scrape_metadata_url = self.url
            self.set_version_and_license()
            return

    if not self.scrape_pdf_url or not self.scrape_version:
        with PmhRepoWebpage(url=self.url,
                            scraped_pdf_url=self.scrape_pdf_url,
                            repo_id=self.repo_id) as my_webpage:
            if not self.scrape_pdf_url:
                my_webpage.scrape_for_fulltext_link()
                self.error += my_webpage.error
                if my_webpage.is_open:
                    logger.info(u"** found an open copy! {}".format(my_webpage.fulltext_url))
                    self.scrape_updated = datetime.datetime.utcnow().isoformat()
                    self.scrape_metadata_url = self.url
                    if my_webpage.scraped_pdf_url:
                        self.scrape_pdf_url = my_webpage.scraped_pdf_url
                    if my_webpage.scraped_open_metadata_url:
                        self.scrape_metadata_url = my_webpage.scraped_open_metadata_url
                    if my_webpage.scraped_license:
                        self.scrape_license = my_webpage.scraped_license
                    if my_webpage.scraped_version:
                        self.scrape_version = my_webpage.scraped_version
            if self.scrape_pdf_url and not self.scrape_version:
                self.set_version_and_license(r=my_webpage.r)

    if self.scrape_pdf_url and not self.scrape_version:
        with PmhRepoWebpage(url=self.url,
                            scraped_pdf_url=self.scrape_pdf_url,
                            repo_id=self.repo_id) as my_webpage:
            my_webpage.set_r_for_pdf()
            self.set_version_and_license(r=my_webpage.r)

    if self.is_open and not self.scrape_version:
        self.scrape_version = self.default_version()

    # associate certain landing page URLs with PDFs
    # https://repository.uantwerpen.be
    if self.endpoint and self.endpoint.id == 'mmv3envg3kaaztya9tmo':
        if self.scrape_pdf_url and self.scrape_pdf_url == self.scrape_metadata_url and self.pmh_record:
            logger.info(u'looking for landing page for {}'.format(self.scrape_pdf_url))
            landing_urls = [u for u in self.pmh_record.urls if u'hdl.handle.net' in u]
            if len(landing_urls) == 1:
                logger.info(u'trying landing page {}'.format(landing_urls[0]))
                try:
                    if http_get(landing_urls[0]).status_code == 200:
                        self.scrape_metadata_url = landing_urls[0]
                except:
                    pass
                if self.scrape_metadata_url:
                    logger.info(u'set landing page {}'.format(self.scrape_metadata_url))

    # https://lirias.kuleuven.be
    if (self.endpoint
            and self.endpoint.id == 'ycf3gzxeiyuw3jqwjmx3'
            and self.scrape_pdf_url == self.scrape_metadata_url
            and self.scrape_pdf_url
            and 'lirias.kuleuven.be' in self.scrape_pdf_url):
        if self.pmh_record and self.pmh_record.bare_pmh_id and 'oai:lirias2repo.kuleuven.be:' in self.pmh_record.bare_pmh_id:
            self.scrape_metadata_url = 'https://lirias.kuleuven.be/handle/{}'.format(
                self.pmh_record.bare_pmh_id.replace('oai:lirias2repo.kuleuven.be:', ''))
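# Illustrative note: for arXiv records, scrape_green() derives the PDF location
# directly from the landing page URL rather than scraping the page; the mapping
# is a plain "abs" -> "pdf" substitution. A minimal, hypothetical example
# (identifier chosen for illustration only):
#
#   url = 'https://arxiv.org/abs/1905.04075'
#   pdf_url = url.replace('abs', 'pdf')   # 'https://arxiv.org/pdf/1905.04075'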