Example #1
    def base_preprocessor_delete_single(instance_id=None, **kw):
        """Create a generic DELETE_SINGLE preprocessor.

        Accepts a single argument, `instance_id`, which is the primary key
        of the instance which will be deleted.
        """
        logger.info('`base_preprocessor_delete_single` used for endpoint')
Example #2
    def base_preprocessor_delete_many(search_params=None, **kw):
        """Create a generic DELETE_MANY preprocessor.

        Accepts a single argument, `search_params`, which is a dictionary
        containing the search parameters for the request.
        """
        logger.info('`base_preprocessor_delete_many` used for endpoint')
Example #3
    def base_preprocessor_post(data=None, **kw):
        """Create a generic POST preprocessor.

        Accepts a single argument, `data`, which is the dictionary of
        fields to set on the new instance of the model.
        """
        logger.info('`base_preprocessor_post` used for endpoint')
Example #4
    def base_preprocessor_get_many(search_params=None, **kw):
        """Create a generic GET_MANY preprocessor.

        Accepts a single argument, `search_params`, which is a dictionary
        containing the search parameters for the request.
        """
        logger.info('`base_preprocessor_get_many` responded to request')
Example #5
    def base_preprocessor_get_single(instance_id=None, **kw):
        """Create a generic GET_SINGLE preprocessor.

        Accepts a single argument, `instance_id`, the primary key of the
        instance of the model to get.
        """
        logger.info('`base_preprocessor_get_single` responded to request')
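The five generic hooks above follow the Flask-Restless preprocessor signatures. A minimal registration sketch, assuming Flask-Restless is in use; `app`, `db`, and the `Farm` model are placeholders, not names taken from the source:

    from flask_restless import APIManager

    manager = APIManager(app, flask_sqlalchemy_db=db)
    manager.create_api(
        Farm,
        methods=['GET', 'POST', 'DELETE'],
        preprocessors={
            'GET_SINGLE': [base_preprocessor_get_single],
            'GET_MANY': [base_preprocessor_get_many],
            'POST': [base_preprocessor_post],
            'DELETE_SINGLE': [base_preprocessor_delete_single],
            'DELETE_MANY': [base_preprocessor_delete_many],
        })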
Example #6
 def run(self):
     logger.info('Starting the heartbeat thread')
     while not self.stop:
         self.process._write_command_('PING')
         self.process.check_alive()
         time.sleep(5)
     logger.info('Stopping the heartbeat thread')
Example #7
	def __init__(self, uuid):
		self.uuid = uuid # + uuid4().hex
		self.keymanager = RedisKeyManager(uuid)				
		self.buy_orders = list()
		self.sell_orders = list()
		self.redis = Redis()
		logger.info('Initializing orderbook: %s'%self.uuid)
Example #8
    def post(self):
        '''!Post a JSON object containing a username and password to create a new user
        {
            "username":"******",
            "password":"******"
        }
        :return: status code 201 - user created successfully
        :return: status code 400 - username and password can't be empty
        :return: status code 409 - user already exists

        '''

        username = self.args['username']
        password = self.args['password']


        if username == '' or password == '':
            return {"message": "username and password can't be empty"}, 400

        if Users.query.filter_by(username=username).first() is not None:
            logger.info('user already exists!')
            abort(409, 'user already exists!')

        newuser = Users(username, password)
        db.session.add(newuser)
        db.session.commit()
        logger.info('created user: ' + username)
        return {"message": "created"}, 201
Example #9
    def harvest(self, **kwargs):  # pragma: no cover
        """Make HTTP requests to the OAI server.
        :param kwargs: OAI HTTP parameters.
        :rtype: :class:`sickle.OAIResponse`
        """
        start_time = time()
        for _ in range(self.max_retries):
            if self.http_method == 'GET':
                payload_str = "&".join("%s=%s" % (k,v) for k,v in kwargs.items())
                url_without_encoding = u"{}?{}".format(self.endpoint, payload_str)
                http_response = requests.get(url_without_encoding,
                                             **self.request_args)
                self.http_response_url = http_response.url
            else:
                http_response = requests.post(self.endpoint, data=kwargs,
                                              **self.request_args)
                self.http_response_url = http_response.url
            if http_response.status_code == 503:
                retry_after = self.RETRY_SECONDS
                logger.info("HTTP 503! Retrying after %d seconds..." % retry_after)
                sleep(retry_after)
            else:
                logger.info("took {} seconds to call pmh url: {}".format(elapsed(start_time), http_response.url))

                http_response.raise_for_status()
                if self.encoding:
                    http_response.encoding = self.encoding
                return OAIResponse(http_response, params=kwargs)
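A possible call, assuming `client` is an instance of the Sickle-style class that defines this `harvest()` method; the keyword arguments become OAI protocol parameters:

    # Hypothetical usage; verb and metadataPrefix are standard OAI-PMH parameters.
    oai_response = client.harvest(verb="ListRecords", metadataPrefix="oai_dc")
    print(oai_response.http_response.status_code)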
Example #10
def login():
    """This function logs a user into the system.
        Upon a GET request a LoginForm will be shown to the user.
        Upon a POST request the form will be validated and if valid the users
            specified password will be hashed and compared to the stored
            password.
            Should they be equal the user will be logged in (as such
                his User object will be stored in the session) and redirected to
                    the default page of the authentication-module.
                Is this not the case or if the form was invalid in the first
                    place, he will be shown the form again.
    """
    form = LoginForm(request.form)

    if request.method == 'POST' and form.validate():
        user = User.objects(username = form.username.data).first()
        if user is not None:
            if user.password == generateHash(form.password.data):
                logger.info('User %s has logged in.' % user.username)
                session['user'] = user
                session['currency'] = u"\u20AC"
                return redirect(session.get('next', url_for('budget.showSummary')))

        logger.info('Failed login attempt for username %s.' % form.username.data)
        flash('The specified username and/or password were incorrect.')
    return render_template('auth/login.html', form = form)
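The `generateHash()` helper is not shown in this snippet; one plausible, purely hypothetical implementation that matches the call site:

    import hashlib

    def generateHash(password):
        # Assumption: an unsalted SHA-256 hex digest; the real project may salt
        # the password or use a different algorithm entirely.
        return hashlib.sha256(password.encode('utf-8')).hexdigest()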
Example #11
def delete_user(id):
    if not g.user.is_admin():
        logger.error("%s tried to access /delete-user/%d", g.user.email, id)
        abort(403)

    user = User.query.get_or_404(id)

    if user.is_admin():
        flash("Cannot delete the admin")
        return redirect(url_for('user_list'))

    form = DeleteUserForm()

    if form.validate_on_submit():
        if request.form['button'] == 'Cancel':
            return form.redirect(url_for('user_list'))
        
        logger.info("%s was deleted", user.email)

        db.session.delete(user)
        db.session.commit()

        flash("User deleted successfully")
        return redirect(url_for('user_list'))

    return render_template('admin_delete_user.html',
            Title = "Delete user",
            form = form,
            user = user)
Example #12
def get(request, response):
    logger.info("Handling request")
    
    response.setStatus(202)
    response.write("Success\n", "text/plain")
Example #13
 def delTag(self, data):
     for del_tag in data:
         remove_tag = Tags.query.filter_by(tag_name=del_tag).first()
         if remove_tag is not None:
             remove_tag.tag_count -= 1
             logger.info("remove_tags=%s" % remove_tag.tag_name)
             self.tags.remove(remove_tag)
Example #14
def keep_redirecting(r, publisher):
    # don't read r.content unless we have to, because it will cause us to download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if ("content-length" in r.headers):
        # manually follow javascript if that's all that's in the payload
        file_size = int(r.headers["content-length"])
        if file_size < 500:
            matches = re.findall(ur"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(ur"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*?url=["\'](.*?)["\']', re.IGNORECASE)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect_path = HTMLParser().unescape(redirect_match[0].strip())
        redirect_url = urlparse.urljoin(r.request.url, redirect_path)
        logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
        return redirect_url

    return None
Example #15
def register():
    if request.method == 'POST':
        logger.info('Registration POST: %s %s %s'
                    % (request.form['email'],
                       request.form['name'],
                       request.form['tel']))
        user = User()
        user.name = request.form.get('name', None)
        user.email = request.form.get('email', None)
        user.tel = request.form.get('tel', None)
        user.msg = request.form.get('message', None)

        if not user.is_valid:
            logger.error('Invalid form. Request: %s' % request)
            return jsonify(False)

        try:
            user.save()
        except Exception:
            logger.error('Could not save to the database. Request: %s' % request)
            return jsonify('Error')
        
        logger.info('Register:Done!')
        send_email(user)
        return jsonify(True)

    else:
        return jsonify(False)
Example #16
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()

    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
Example #17
def save_feedback_response(bound_logger, survey_feedback_response):
    bound_logger.info("Saving feedback response")
    survey = survey_feedback_response.get("survey_id")
    period = survey_feedback_response.get("collection", {}).get("period")

    invalid = survey_feedback_response.get("invalid")
    if invalid:
        survey_feedback_response.pop("invalid")

    feedback_response = FeedbackResponse(invalid=invalid,
                                         data=survey_feedback_response,
                                         survey=survey,
                                         period=period)

    try:
        db.session.add(feedback_response)
        db.session.commit()
    except IntegrityError as e:
        logger.error("Integrity error in database. Rolling back commit", error=e)
        db.session.rollback()
        raise e
    except SQLAlchemyError as e:
        logger.error("Unable to save response", error=e)
        db.session.rollback()
        raise e
    else:
        logger.info("Feedback response saved")

    return invalid
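A hedged example of invoking `save_feedback_response()`, assuming a structlog-style bound logger; the payload keys mirror what the function reads above and all values are invented:

    # Hypothetical call with a made-up transaction id and feedback payload.
    bound_logger = logger.bind(tx_id="00000000-0000-0000-0000-000000000000")
    payload = {
        "survey_id": "023",
        "collection": {"period": "201803"},
        "invalid": False,
    }
    save_feedback_response(bound_logger, payload)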
Example #18
def generateSSLCert():
    if not os.path.exists(os.path.join(config.DATA_DIR, 'plexivity.key')) or not os.path.exists(os.path.join(config.DATA_DIR, 'plexivity.crt')):
        logger.warning("plexivity was started with ssl support but no cert was found, trying to generating cert and key now")
        try:
            from OpenSSL import crypto, SSL
            from socket import gethostname

            
            # create a key pair
            k = crypto.PKey()
            k.generate_key(crypto.TYPE_RSA, 1024)
    
            # create a self-signed cert
            cert = crypto.X509()
            cert.get_subject().C = "US"
            cert.get_subject().ST = "plex land"
            cert.get_subject().L = "plex land"
            cert.get_subject().O = "plexivity"
            cert.get_subject().OU = "plexivity"
            cert.get_subject().CN = gethostname()
            cert.set_serial_number(1000)
            cert.gmtime_adj_notBefore(0)
            cert.gmtime_adj_notAfter(10*365*24*60*60)
            cert.set_issuer(cert.get_subject())
            cert.set_pubkey(k)
            cert.sign(k, 'sha1')
    
            open(os.path.join(config.DATA_DIR, 'plexivity.crt'), "wt").write(crypto.dump_certificate(crypto.FILETYPE_PEM, cert))
            open(os.path.join(config.DATA_DIR, 'plexivity.key'), "wt").write(crypto.dump_privatekey(crypto.FILETYPE_PEM, k))
            logger.info("ssl cert and key generated and saved to: %s" % config.DATA_DIR)
        except Exception:
            logger.error("unable to generate ssl key and cert")
Example #19
def get_pdf_url_status(pdf_url):
    worker = current_process()
    logger.info(u'{} checking pdf url: {}'.format(worker, pdf_url))

    is_pdf = False
    http_status = None

    try:
        response = http_get(
            url=pdf_url.url, ask_slowly=True, stream=True,
            publisher=pdf_url.publisher, session_id=get_session_id()
        )
    except Exception as e:
        logger.error(u"{} failed to get response: {}".format(worker, e.message))
    else:
        with response:
            try:
                is_pdf = is_a_pdf_page(response, pdf_url.publisher)
                http_status = response.status_code
            except Exception as e:
                logger.error(u"{} failed reading response: {}".format(worker, e.message))

    pdf_url.is_pdf = is_pdf
    pdf_url.http_status = http_status
    pdf_url.last_checked = datetime.utcnow()

    logger.info(u'{} updated pdf url: {}'.format(worker, pdf_url))

    return pdf_url
Example #20
def get_multiple_pubs_response():
    is_person_who_is_making_too_many_requests = False

    biblios = []
    body = request.json
    if "dois" in body:
        if len(body["dois"]) > 25:
            abort_json(413, "max number of DOIs is 25")
        if len(body["dois"]) > 1:
            is_person_who_is_making_too_many_requests = True
        for doi in body["dois"]:
            biblios += [{"doi": doi}]
            if u"jama" in doi:
                is_person_who_is_making_too_many_requests = True

    elif "biblios" in body:
        for biblio in body["biblios"]:
            biblios += [biblio]

        if len(body["biblios"]) > 1:
            is_person_who_is_making_too_many_requests = True

    logger.info(u"in get_multiple_pubs_response with {}".format(biblios))

    run_with_hybrid = g.hybrid
    if is_person_who_is_making_too_many_requests:
        logger.info(u"is_person_who_is_making_too_many_requests, so returning 429")
        abort_json(429, u"sorry, you are calling us too quickly.  Please email [email protected] so we can figure out a good way to get you the data you are looking for.")
    pubs = pub.get_pubs_from_biblio(biblios, run_with_hybrid)
    return pubs
Example #21
def startScheduler():
    db.create_all()
    #create default roles!
    if not db.session.query(models.Role).filter(models.Role.name == "admin").first():
        admin_role = models.Role(name='admin', description='Administrator Role')
        user_role = models.Role(name='user', description='User Role')
        db.session.add(admin_role)
        db.session.add(user_role)
        db.session.commit()
        
    try:
        import tzlocal

        tz = tzlocal.get_localzone()
        logger.info("local timezone: %s" % tz)
    except Exception:
        tz = None

    if not tz or tz.zone == "local":
        logger.error('Local timezone name could not be determined. Scheduler will display times in UTC for any log '
                     'messages. To resolve this set up /etc/timezone with the correct time zone name.')
        tz = pytz.utc
    #in debug mode this is executed twice :(
    #DONT run flask in auto reload mode when testing this!
    scheduler = BackgroundScheduler(logger=sched_logger, timezone=tz)
    scheduler.add_job(notify.task, 'interval', seconds=config.SCAN_INTERVAL, max_instances=1,
                      start_date=datetime.datetime.now(tz) + datetime.timedelta(seconds=2))
    scheduler.start()
    sched = scheduler
Example #22
    def crop_postprocessor_update_single(result=None, **kw):
        """Create an Crop specific PATCH_SINGLE and PUT_SINGLE postprocessor.

        Accepts a single argument, `result`, which is the dictionary
        representation of the requested instance of the model.
        """
        logger.info('`crop_postprocessor_update_single` used for endpoint')
Example #23
    def crop_postprocessor_get_single(result=None, **kw):
        """Create an Crop specific GET_SINGLE postprocessor.

        Accepts a single argument, `result`, which is the dictionary
        representation of the requested instance of the model.
        """
        logger.info('`crop_postprocessor_get_single` responded to request')
Example #24
    def crop_postprocessor_post(result=None, **kw):
        """Create an Crop specific POST postprocessor.

        Accepts a single argument, `result`, which is the dictionary
        representation of the created instance of the model.
        """
        logger.info('`crop_postprocessor_post` used for endpoint')
Example #25
    def crop_postprocessor_delete_single(was_deleted=None, **kw):
        """Create an Crop specific DELETE_SINGLE postprocessor.

        Accepts a single argument, `was_deleted`, which represents whether
        the instance has been deleted.
        """
        logger.info('`crop_postprocessor_delete_single` used for endpoint')
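As with the generic preprocessors earlier, these Crop postprocessors map onto Flask-Restless hook names. A registration sketch, again assuming Flask-Restless; `manager` and `Crop` are placeholders:

    manager.create_api(
        Crop,
        methods=['GET', 'POST', 'PATCH', 'DELETE'],
        postprocessors={
            'GET_SINGLE': [crop_postprocessor_get_single],
            'POST': [crop_postprocessor_post],
            'PATCH_SINGLE': [crop_postprocessor_update_single],  # PUT is handled like PATCH
            'DELETE_SINGLE': [crop_postprocessor_delete_single],
        })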
Example #26
def get_statistics():
    """Gathers all user's statistics.

    It gathers all statistics about user activity(qantity of memorized words,
    quantity of passed test, average grade and quantity of passed tests per
    week).

    :Route:
        '/api/user/statistic'.

    :Methods:
        GET.

    :Returns:
        JSON object with all of the user's statistics.
    """
    uid = flask_login.current_user.uid
    tests = db.get_user_tests(uid)
    tests_taken = len(tests)
    logger.info(tests)
    average_grade = reduce(lambda x, y: x + y,
                           [item[0] for item in tests]) / tests_taken
    words = db.get_words(uid, True)
    words_memorized = len(words)
    tests_per_week = utils.get_tests_count_per_week(tests)
    result = {'tests_taken': tests_taken, 'average_grade': average_grade,
              'words_memorized': words_memorized,
              'tests_per_week': tests_per_week}
    return Response(json.dumps(result), mimetype='application/json',
                    status=200)
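A hedged client sketch against the documented route; the host and the flask-login cookie handling are assumptions:

    import requests

    # Hypothetical session that has already authenticated via flask-login.
    session = requests.Session()
    resp = session.get("http://localhost:5000/api/user/statistic")
    # e.g. {"tests_taken": 12, "average_grade": 87, "words_memorized": 340, "tests_per_week": 3}
    print(resp.json())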
Example #27
def login():
  import hashlib
  if request.method == 'GET':
     return render_template("login.html")

  email = request.form["email"]
  password = request.form["password"]

  m = hashlib.md5()
  m.update(password)

  userinfo = User.query.filter(User.email == email).first()
  if userinfo is None:
    msg = '用户不存在, 请先注册'  # "User does not exist, please register first"
    logger.info(msg)
    return render_template("login.html", email = email, msg = msg)

  userinfo = User.query.filter(and_(User.email == email, User.password == m.hexdigest())).first()
  if userinfo is None:
    msg = '用户名密码不正确'  # "Incorrect username or password"
    password = ''
    return render_template("login.html", email = email, password = password, msg = msg)

  print 'login successful'
  response = make_response(redirect('/'))
  secure_token = create_token(userinfo.id, userinfo.email, request.user_agent)

  response.set_cookie('secure_token', value=secure_token, max_age=2592000)
  login_user(userinfo)
  return response
Example #28
def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(
                response.request.url)
            )
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDFs start with this character
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article')
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True
    return False
Example #29
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)

    if where:
        create_table_command = create_table_command.replace("from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)
    create_table_command += """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;
        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(
        table_name=table_name)
    for command in create_table_command.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
     SELECT id AS doi,
        updated AS updated,
        response_jsonb->>'evidence' AS evidence,
        response_jsonb->>'oa_status' AS oa_color,
        response_jsonb->>'free_fulltext_url' AS best_open_url,
        response_jsonb->>'year' AS year,
        response_jsonb->>'found_hybrid' AS found_hybrid,
        response_jsonb->>'found_green' AS found_green,
        response_jsonb->>'error' AS error,
        response_jsonb->>'is_boai_license' AS is_boai_license,
        replace(api->'_source'->>'journal', '
    ', '') AS journal,
        replace(api->'_source'->>'publisher', '
    ', '') AS publisher,
        api->'_source'->>'title' AS title,
        api->'_source'->>'subject' AS subject,
        response_jsonb->>'license' AS license
       FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
Example #30
    def post(self):
        '''!Receive consent receipt
        :return: status code 409 - already exists
        :return: status code 201 - created
        '''
        logger.info(json.loads(request.get_data()))

        for item in self.receipt:
            self.receipt[item] = request.json.get(item)

        receipt = Receipts.query.filter_by(
            consent_receipt_id=self.receipt['consentReceipt']['consent_receipt_id']).first()

        # if the receipt exists, update it
        if receipt is not None:
            logger.debug('Receipt already exists!')

            # TODO still return 409 after demo
            # abort(409, 'Receipt already exist!')

            #=====start=====
            receipt.rpt = self.receipt['rpt']
            receipt.rs_id = self.receipt['consentReceipt']['rs_id']
            receipt.consent_receipt_id = self.receipt['consentReceipt']['consent_receipt_id']
            receipt.service_contract_id = self.receipt['consentReceipt']['service_contract_id']
            receipt.authorization_status = self.receipt['consentReceipt']['authorization_status']
            receipt.data_usage_license = self.receipt['consentReceipt']['data_usage_license']
            receipt.consent_summary = json.dumps(self.receipt['consentReceipt']['consent_summary'])
            receipt.update_time = datetime.datetime.now()
            db.session.add(receipt)
            db.session.commit()
            return {'message': 'updated'}, 201
            #======end=====

        for item in self.receipt['consentReceipt']:
            if self.receipt['consentReceipt'][item] is None:
                logger.debug(item + ' can not be none!')
                abort(409, item + ' can not be none!')
                # receipt[item] = self.receipt[item]

        receipt = Receipts(
            self.receipt['rpt'],
            self.receipt['consentReceipt']['rs_id'],
            str(self.receipt['consentReceipt']['consent_receipt_id']),
            str(self.receipt['consentReceipt']['service_contract_id']),
            self.receipt['consentReceipt']['authorization_status'],
            str(self.receipt['consentReceipt']['data_usage_license']),
            json.dumps(self.receipt['consentReceipt']['consent_summary']))

        mapping = Mappings(
            self.receipt['consentReceipt']['account_id'],
            str(self.receipt['consentReceipt']['consent_receipt_id']),
            datetime.datetime.now()
        )

        db.session.add(mapping)
        db.session.add(receipt)
        db.session.commit()
        return {'message': 'created'}, 201
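A hedged sketch of the payload this handler expects, built only from the keys it reads above; the endpoint path and every value are invented:

    import requests

    receipt_payload = {
        "rpt": "example-rpt-token",
        "consentReceipt": {
            "rs_id": "resource-set-1",
            "consent_receipt_id": "cr-123",
            "service_contract_id": "sc-456",
            "authorization_status": "active",
            "data_usage_license": "http://example.com/license",
            "consent_summary": {"purpose": "demo"},
            "account_id": "account-1",
        },
    }
    resp = requests.post("http://localhost:5000/receipts", json=receipt_payload)
    print(resp.status_code)  # 201 for created (or updated) in this implementation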
Example #31
def export_with_versions(do_all=False,
                         job_type="normal",
                         filename=None,
                         view=None,
                         week=False,
                         json=False):

    # ssh -i /Users/hpiwowar/Dropbox/ti/certificates/aws-data-export.pem [email protected]
    # aws s3 cp test.txt s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/test.txt

    # connect to our bucket
    (conn, ssh_client) = login_to_aws()

    # to connect to clarivate's bucket
    # clarivate_conn = boto.ec2.connect_to_region('us-east-2')
    # clarivate_instance = clarivate_conn.get_all_instances()[0].instances[0]
    # clarivate_ssh_client = sshclient_from_instance(clarivate_instance, "/Users/hpiwowar/Dropbox/ti/certificates/aws-data-export.pem", user_name="ec2-user")

    logger.info(u"log in done")

    now_timestamp = datetime.datetime.utcnow().isoformat()[0:19].replace(
        ":", "")
    if not filename:
        filename = "all_dois_{}.csv".format(now_timestamp)

    today = datetime.datetime.utcnow()
    if week:
        last_week = today - datetime.timedelta(days=9)
        view = "export_main_changed_with_versions where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format(
            last_week.isoformat()[0:19])
        filename = "changed_dois_with_versions_{}_to_{}.csv".format(
            last_week.isoformat()[0:19],
            today.isoformat()[0:19]).replace(":", "")
    else:
        filename = "dois_with_versions_{}.csv".format(
            today.isoformat()[0:19]).replace(":", "")

    if not view:
        view = "export_main_changed_with_versions"

    command = """psql {}?ssl=true -c "\copy (select * from {}) to '{}' WITH (FORMAT CSV, HEADER);" """.format(
        os.getenv("DATABASE_URL"), view, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """gzip -c {} > {}.gz;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """date -r {}.gz;""".format(filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))
    gz_modified = stdout.strip()

    # command = """aws s3 cp {}.gz s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
    #     filename, filename, gz_modified)
    command = """aws s3 cp {}.gz s3://oadoi-for-clarivate/{}.gz --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
        filename, filename, gz_modified)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # also make a .DONE file
    # how to calculate a checksum http://www.heatware.net/linux-unix/how-to-create-md5-checksums-and-validate-a-file-in-linux/
    command = """md5sum {}.gz > {}.gz.DONE;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """date -r {}.gz;""".format(filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))
    gz_done_modified = stdout.strip()

    # copy up the .DONE file
    # command = """aws s3 cp {}.gz.DONE s3://mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz.DONE --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
    #     filename, filename, gz_done_modified)
    command = """aws s3 cp {}.gz.DONE s3://oadoi-for-clarivate/{}.gz.DONE --acl public-read --metadata "modifiedtimestamp='{}'";""".format(
        filename, filename, gz_done_modified)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # logger.info(u"now go to *** https://console.aws.amazon.com/s3/object/mpr-ims-harvestor/mpr-ims-dev/harvestor_staging_bigBatch/OA/{}.gz?region=us-east-1&tab=overview ***".format(
    #     filename))
    logger.info(
        u"public link is at *** https://s3-us-west-2.amazonaws.com/oadoi-for-clarivate/{}.gz ***"
        .format(filename))

    conn.close()
Example #32
def export_no_versions(do_all=False,
                       job_type="normal",
                       filename=None,
                       view="export_main_no_versions",
                       week=False,
                       json=False):
    (conn, ssh_client) = login_to_aws()

    logger.info(u"log in done")

    today = datetime.datetime.utcnow()

    if week:
        last_week = today - datetime.timedelta(days=9)
        if json:
            view = "pub where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format(
                last_week.isoformat()[0:19])
            filename = "changed_dois_{}_to_{}.jsonl".format(
                last_week.isoformat()[0:19],
                today.isoformat()[0:19]).replace(":", "")
        else:
            view = "export_main_changed_no_versions where last_changed_date >= '{}'::timestamp and updated > '1043-01-01'::timestamp".format(
                last_week.isoformat()[0:19])
            filename = "changed_dois_{}_to_{}.csv".format(
                last_week.isoformat()[0:19],
                today.isoformat()[0:19]).replace(":", "")
    else:
        if json:
            filename = "full_dois_{}.jsonl".format(
                today.isoformat()[0:19]).replace(":", "")
        else:
            filename = "full_dois_{}.csv".format(
                today.isoformat()[0:19]).replace(":", "")

    if json:
        command = """psql {}?ssl=true -c "\copy (select response_jsonb from {}) to '{}';" """.format(
            os.getenv("DATABASE_URL"), view, filename)
    else:
        command = """psql {}?ssl=true -c "\copy (select * from {}) to '{}' WITH (FORMAT CSV, HEADER);" """.format(
            os.getenv("DATABASE_URL"), view, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    if json:
        command = """sed -i 's/"publishedVersion"/null/g; s/"submittedVersion"/null/g; s/"acceptedVersion"/null/g' {}""".format(
            filename)
        logger.info(command)
        status, stdout, stderr = ssh_client.run(command)
        logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """gzip -c {} > {}.gz; date;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    command = """aws s3 cp {}.gz s3://unpaywall-data-updates/{}.gz --acl public-read; date; """.format(
        filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # also make a .DONE file
    # how to calculate a checksum http://www.heatware.net/linux-unix/how-to-create-md5-checksums-and-validate-a-file-in-linux/
    command = """md5sum {}.gz > {}.gz.DONE; date;""".format(filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    # copy up the .DONE file
    command = """aws s3 cp {}.gz.DONE s3://unpaywall-data-updates/{}.gz.DONE --acl public-read; date;""".format(
        filename, filename)
    logger.info(command)
    status, stdout, stderr = ssh_client.run(command)
    logger.info(u"{} {} {}".format(status, stdout, stderr))

    logger.info(
        u"now go to *** https://console.aws.amazon.com/s3/object/unpaywall-data-updates/{}.gz?region=us-east-1&tab=overview ***"
        .format(filename))
    logger.info(
        u"public link is at *** https://s3-us-west-2.amazonaws.com/unpaywall-data-updates/{}.gz ***"
        .format(filename))

    conn.close()
Example #33
def on_connect(client, userdata, flags, rc):
    print("Connected with result code " + str(rc))
    logger.info("Connected with result code " + str(rc))
    client.subscribe("Modbus/Received")
    Mqtt_Stat.value = rc
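Typical wiring for this callback, assuming the paho-mqtt client library; the broker address is a placeholder:

    import paho.mqtt.client as mqtt

    client = mqtt.Client()
    client.on_connect = on_connect
    client.connect("broker.example.com", 1883, 60)
    client.loop_forever()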
Example #34
 def subscribe(self):
     new_client = SSEClient(self)
     self.clients.append(new_client)
     logger.info('new client subscribed: {}'.format(new_client))
     return new_client
Example #35
def print_ip():
    user_agent = request.headers.get('User-Agent')
    logger.info(u"calling from IP {ip}. User-Agent is '{user_agent}'.".format(
        ip=get_ip(), user_agent=user_agent))
Example #36
    def fetch_queue_chunk(self, chunk_size, scrape_publisher):
        logger.info(u"looking for new jobs")

        if scrape_publisher:
            pmh_value_filter = "and pmh_id = '{}'".format(publisher_equivalent_pmh_id)
        else:
            pmh_value_filter = "and pmh_id is distinct from '{}'".format(publisher_equivalent_pmh_id)

        text_query_pattern = """
            with update_chunk as (
                select
                    lru_by_endpoint.id
                    from
                        endpoint e
                        cross join lateral (
                            select qt.*
                            from
                                {queue_table} qt
                                join page_new p using (id)
                            where
                                qt.endpoint_id = e.id
                                and qt.started is null
                                and (qt.finished is null or qt.finished < now() - '1 day'::interval)
                                and qt.endpoint_id is distinct from '{biorxiv_id}'
                                {pmh_value_filter}
                            order by qt.finished asc nulls first
                            limit 1
                            for update of qt skip locked
                        ) lru_by_endpoint
                    order by finished asc nulls first, rand
                    limit {chunk_size}
            )
            update {queue_table} queue_rows_to_update
            set started=now()
            from update_chunk
            where update_chunk.id = queue_rows_to_update.id
            returning update_chunk.id;
        """

        text_query = text_query_pattern.format(
            chunk_size=chunk_size,
            queue_table=self.table_name(None),
            pmh_value_filter=pmh_value_filter,
            biorxiv_id=biorxiv_endpoint_id
        )

        logger.info(u"the queue query is:\n{}".format(text_query))

        job_time = time()
        row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
        object_ids = [row[0] for row in row_list]
        logger.info(u"got {} ids, took {} seconds".format(len(object_ids), elapsed(job_time)))

        job_time = time()
        q = db.session.query(PageNew).options(
            orm.undefer('*')
        ).filter(PageNew.id.in_(object_ids))

        objects = q.all()
        logger.info(u"got page_new objects in {} seconds".format(elapsed(job_time)))

        return objects
Example #37
def scroll_through_all_dois(query_doi=None,
                            first=None,
                            last=None,
                            today=False,
                            week=False,
                            chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]"
    }

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    dois_from_api = []
    number_added = 0

    start_time = time()
    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(first=first,
                              last=last,
                              rows=chunk_size,
                              next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(
            u"getting crossref response took {} seconds.  url: {}".format(
                elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(
                resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [
            clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]
        ]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
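A possible invocation, with dates chosen arbitrarily for illustration:

    # Hypothetical call: pull DOIs created during one week, 500 per Crossref page.
    n = scroll_through_all_dois(first="2020-01-01", last="2020-01-08", chunk_size=500)
    logger.info(u"added {} new pubs in total".format(n))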
Example #38
        async def connect(self):
            logger.info("before connect")

            self.fast_mqtt.init_app(self.app)
            logger.info("after connect")
Example #39
 def publish(self, topic, message):
     logger.info(f' publishing to {topic}. message: {message}')
     return self.fast_mqtt.publish(topic, message)
Example #40
    def set_version_and_license(self, r=None):
        self.updated = datetime.datetime.utcnow().isoformat()

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = self.default_version()

        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself
        version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
            ur"<dc:type>{}</dc:type>".format(self.scrape_version), re.IGNORECASE | re.MULTILINE | re.DOTALL
        ).findall(self.pmh_record.api_raw)

        if version_is_from_strict_metadata or not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r, max_pages=25)
            # logger.info(text)

            if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            if text and self.scrape_version != 'acceptedVersion':
                patterns = [
                    re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                        self.scrape_version = "acceptedVersion"

                if r and r.url and '61RMIT_INST' in r.url:
                    if 'Version: Accepted' in text:
                        logger.info(u'found Version: Accepted, decided PDF is accepted version')
                        self.scrape_version = "acceptedVersion"

                heading_text = text[0:50].lower()
                accepted_headings = [
                    "final accepted version",
                    "accepted manuscript",
                ]

                for heading in accepted_headings:
                    if heading in heading_text:
                        logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                        self.scrape_version = "acceptedVersion"
                        break

            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        if self.pmh_record:
            self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
Example #41
 def print_status(self, job_type):
     num_dois = self.number_total_on_queue(job_type)
     num_waiting = self.number_waiting_on_queue(job_type)
     if num_dois:
         logger.info(u"There are {} dois in the queue, of which {} ({}%) are waiting to run".format(
             num_dois, num_waiting, int(100*float(num_waiting)/num_dois)))
Example #42
    def monitor_till_done(self, job_type):
        logger.info(u"collecting data. will have some stats soon...")
        logger.info(u"\n\n")

        num_total = self.number_total_on_queue(job_type)
        print "num_total", num_total
        num_unfinished = self.number_unfinished(job_type)
        print "num_unfinished", num_unfinished

        loop_thresholds = {"short": 30, "long": 10 * 60, "medium": 60}
        loop_unfinished = {"short": num_unfinished, "long": num_unfinished}
        loop_start_time = {"short": time(), "long": time()}

        # print_idle_dynos(job_type)

        while all(loop_unfinished.values()):
            for loop in ["short", "long"]:
                if elapsed(loop_start_time[loop]) > loop_thresholds[loop]:
                    if loop in ["short", "long"]:
                        num_unfinished_now = self.number_unfinished(job_type)
                        num_finished_this_loop = loop_unfinished[
                            loop] - num_unfinished_now
                        loop_unfinished[loop] = num_unfinished_now
                        if loop == "long":
                            logger.info(u"\n****"),
                        logger.info(
                            u"   {} finished in the last {} seconds, {} of {} are now finished ({}%).  "
                            .format(
                                num_finished_this_loop, loop_thresholds[loop],
                                num_total - num_unfinished_now, num_total,
                                int(100 *
                                    float(num_total - num_unfinished_now) /
                                    num_total))
                        ),  # comma so the next part will stay on the same line
                        if num_finished_this_loop:
                            minutes_left = float(
                                num_unfinished_now
                            ) / num_finished_this_loop * loop_thresholds[
                                loop] / 60
                            logger.info(
                                u"{} estimate: done in {} mins, which is {} hours"
                                .format(loop, round(minutes_left, 1),
                                        round(minutes_left / 60, 1)))
                        else:
                            print
                        loop_start_time[loop] = time()
                        # print_idle_dynos(job_type)
            print ".",
            sleep(3)
        logger.info(u"everything is done.  turning off all the dynos")
        self.scale_dyno(0, job_type)
Example #43
            scale_dyno(0, job_type)
        truncate(job_type)
        add_dois_to_queue_from_file(parsed_args.filename, job_type)

    if parsed_args.addall or parsed_args.where:
        if num_dynos(job_type) > 0:
            scale_dyno(0, job_type)
        add_dois_to_queue_from_query(parsed_args.where, job_type)

    if parsed_args.soup:
        if num_dynos(job_type) > 0:
            scale_dyno(0, job_type)
        if parsed_args.dynos:
            scale_dyno(parsed_args.dynos, job_type)
        else:
            logger.info(u"no number of dynos specified, so setting 1")
            scale_dyno(1, job_type)
        monitor_till_done(job_type)
        scale_dyno(0, job_type)
        export_with_versions(parsed_args.all, job_type, parsed_args.filename,
                             parsed_args.view)
    else:
        if parsed_args.dynos is not None:  # to tell the difference from setting to 0
            scale_dyno(parsed_args.dynos, job_type)
            # if parsed_args.dynos > 0:
            #     print_logs(job_type)

    if parsed_args.reset:
        reset_enqueued(job_type)

    if parsed_args.status:
Example #44
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            with closing(
                    http_get(url,
                             stream=True,
                             related_pub=self.related_pub,
                             ask_slowly=self.ask_slowly)) as self.r:

                if self.r.status_code != 200:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if self.is_a_pdf_page():
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"this is a PDF. success! [{}]".format(url))
                    self.scraped_pdf_url = url
                    return

                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"is not a PDF for {}.  continuing more checks".
                            format(url))

                # now before reading the content, bail if it is too large
                if is_response_too_large(self.r):
                    logger.info(u"landing page is too large, skipping")
                    return

                # get the HTML tree
                page = self.r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                # special exception for citeseer because we want the pdf link where
                # the copy is on the third party repo, not the cached link, if we can get it
                if u"citeseerx.ist.psu.edu/" in url:
                    matches = re.findall(
                        u'<h3>Download Links</h3>.*?href="(.*?)"', page,
                        re.DOTALL)
                    if matches:
                        self.scraped_pdf_url = unicode(matches[0], "utf-8")
                        self.scraped_open_metadata_url = url
                        return

                pdf_download_link = self.find_pdf_link(page)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a PDF download link: {} {} [{}]".format(
                                pdf_download_link.href,
                                pdf_download_link.anchor, url))

                    pdf_url = get_link_target(pdf_download_link.href,
                                              self.r.url)
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"checking to see the PDF link actually gets a PDF [{}]"
                            .format(url))
                    if self.gets_a_pdf(pdf_download_link, self.r.url):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a pdf
                # if they are linking to a .docx or similar, this is open.
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link.href, self.r.url),
                                url))
                    self.scraped_open_metadata_url = url
                    return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
Example #45
def get_new_dois_and_data_from_crossref(query_doi=None,
                                        first=None,
                                        last=None,
                                        today=False,
                                        week=False,
                                        chunk_size=1000):
    i = 0
    records_to_save = []

    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]"
    }

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() -
                 datetime.timedelta(days=7)).isoformat()
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() -
                 datetime.timedelta(days=2)).isoformat()

    if not first:
        first = "2016-04-01"

    start_time = time()

    while has_more_responses:

        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first,
                                                last=last,
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster if no last date is specified, even if it is far in the future
                url = root_url_no_last.format(first=first,
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(
                resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
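            # URL-encode the cursor before it is interpolated into the next request URL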
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(
                        u"added {} pubs, loop done in {} seconds".format(
                            len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []
                    loop_time = time()

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(
        u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
            num_pubs_added_so_far,
            datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
Exemplo n.º 46
0
    def gets_a_pdf(self, link, base_url):
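        # follow the candidate link and stream the response to check whether it
        # actually returns a PDF; purchase links are rejected up front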

        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a pdf".format(absolute_url))

        start = time()
        try:
            with closing(
                    http_get(absolute_url,
                             stream=True,
                             related_pub=self.related_pub,
                             ask_slowly=self.ask_slowly)) as self.r:

                if self.r.status_code != 200:
                    self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(
                        self.r.status_code, absolute_url)
                    return False

                if self.is_a_pdf_page():
                    return True

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(
                absolute_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)

        if DEBUG_SCRAPING:
            logger.info(
                u"we've decided this ain't a PDF. took {} seconds [{}]".format(
                    elapsed(start), absolute_url))
        return False
Exemplo n.º 47
0
        type=str,
        help="last filename to process (example: --last 2006-01-01)")

    parser.add_argument('--query_doi',
                        nargs="?",
                        type=str,
                        help="pull in one doi")

    parser.add_argument(
        '--today',
        action="store_true",
        default=False,
        help="use if you want to pull in crossref records from last 2 days")
    parser.add_argument(
        '--week',
        action="store_true",
        default=False,
        help="use if you want to pull in crossref records from last 7 days")

    parser.add_argument('--chunk_size',
                        nargs="?",
                        type=int,
                        default=1000,
                        help="how many docs to put in each POST request")

    parsed = parser.parse_args()

    logger.info(u"calling {} with these args: {}".format(
        function.__name__, vars(parsed)))
    function(**vars(parsed))
Exemplo n.º 48
0
def Mqtt_process(Stat, MqConn, MqStatChild, MqDataChild):
    mq = mqtt_parameters.query.get(1)
    try:
        client = mqtt.Client(client_id="Proj_%s" % random.getrandbits(8))
        client.on_connect = on_connect
        # Mqtt_Stat = client._handle_connack()
        if mq.mqtt_user_name and mq.mqtt_password:
            client.username_pw_set(mq.mqtt_user_name, mq.mqtt_password)
        elif mq.mqtt_access_token:
            client.username_pw_set(mq.mqtt_access_token)

        client.connect(mq.mqtt_ip, mq.mqtt_port, 60)
        MqStatChild.send("Set client parameters")
        logger.info("Set client parameters")

        client.loop_start()
        MqStatChild.send("Loop Started & Connected to Server")
        logger.info("Loop Started & Connected to Server")
        Mqtt_Stat = Stat.value
        while (Mqtt_Stat > 0):
            time.sleep(1)
            Mqtt_Stat = Stat.value  # refresh the status so the loop can exit once connected

            if Mqtt_Stat == 0:
                pass

            elif Mqtt_Stat == 1: #---Connection refused - incorrect protocol version---#
                client.loop_stop()
                MqStatChild.send("Connection refused - incorrect protocol version")
                logger.error("Connection refused - incorrect protocol version")

            elif Mqtt_Stat == 2 : #---Connection refused - invalid client identifier---#
                MqStatChild.send("Connection refused - invalid client identifier")
                logger.error("Connection refused - invalid client identifier")
                client.loop_stop()
                time.sleep(1)
                client = mqtt.Client(client_id="Proj_%s" % random.getrandbits(8))
                client.on_connect = on_connect
                MqStatChild.send("Changed to another client identifier")
                logger.info("Changed to another client identifier")
                # reconnect with the new client id before restarting the network loop
                client.connect(mq.mqtt_ip, mq.mqtt_port, 60)
                client.loop_start()
                MqStatChild.send("Loop Started")
                logger.info("Loop Started")

            elif Mqtt_Stat == 3: #-- Connection refused - server unavailable ---#
                client.loop_stop()
                MqStatChild.send("Connection Unaviable Checck Internet")
                logger.error("Connection Unaviable Checck Internet")

            elif Mqtt_Stat == 4: #---Connection refused - bad username or password---#
                client.loop_stop()
                MqStatChild.send(" Connection refused - bad username or password")
                logger.error(" Connection refused - bad username or password")
            elif Mqtt_Stat == 5 : #---Connection refused - not authorised---#
                client.loop_stop()
                MqStatChild.send("Connection refused - not authorised")
                logger.error("Connection refused - not authorised")

            else:
                MqStatChild.send("Waiting for connection or not yet connected, Mqtt_Stat = %s" % Mqtt_Stat)
                logger.info("Waiting for connection or not yet connected, Mqtt_Stat = %s" % Mqtt_Stat)

        while True:
            if MqConn.poll():
                msg = MqConn.recv()
                client.publish(msg[0].topic, payload=msg[1], qos=msg[0].qos, retain=msg[0].retain)
                # client.publish(msg["topic"], msg["value"])
                MqDataChild.send(msg)

    except Exception as e:
        client.loop_stop()
        print("Mqtt error - {}".format(e))
        MqStatChild.send("Mqtt Disconnected, mqtt Process Stopped")
        MqStatChild.send(str(e))
        logger.exception("Got Exception")
Exemplo n.º 49
0
    def worker_run(self, **kwargs):
        run_class = PageNew

        single_id = kwargs.get("id", None)
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)
        scrape_publisher = kwargs.get("scrape_publisher", False)

        if limit is None:
            limit = float("inf")

        if single_id:
            page = run_class.query.filter(run_class.id == single_id).first()
            page.scrape()
            db.session.merge(page)
            safe_commit(db) or logger.info(u"COMMIT fail")
        else:
            index = 0
            num_updated = 0
            start_time = time()

            while num_updated < limit:
                new_loop_start_time = time()

                objects = self.fetch_queue_chunk(chunk_size, scrape_publisher)

                if not objects:
                    sleep(5)
                    continue

                scraped_ids = scrape_pages(objects)
                unscraped_ids = [obj.id for obj in objects if obj.id not in scraped_ids]

                logger.info(u'scraped {} pages and returned {} to the queue'.format(
                    len(scraped_ids), len(unscraped_ids)
                ))
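                # mark scraped pages as finished and release the unscraped ones back to
                # the queue by clearing their started timestamp (the two updates below)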

                scraped_batch_text = u'''
                    update {queue_table}
                    set finished = now(), started=null
                    where id = any(:ids)'''.format(queue_table=self.table_name(None))

                unscraped_batch_text = u'''
                     update {queue_table}
                     set started=null
                     where id = any(:ids)'''.format(queue_table=self.table_name(None))

                scraped_batch_command = text(scraped_batch_text).bindparams(
                    ids=scraped_ids)

                unscraped_batch_command = text(unscraped_batch_text).bindparams(
                    ids=unscraped_ids)

                db.session.execute(scraped_batch_command)
                db.session.execute(unscraped_batch_command)

                commit_start_time = time()
                safe_commit(db) or logger.info(u"COMMIT fail")
                logger.info(u"commit took {} seconds".format(elapsed(commit_start_time, 2)))

                index += 1
                num_updated += chunk_size
                self.print_update(new_loop_start_time, len(scraped_ids), limit, start_time, index)
Exemplo n.º 50
0
def Mod_ReadWrite(ModConn, ModStatChild):
    mod = modbus_parameters.query.get(1)
    
    PubTopics = pub_mqtt_topics.query.filter(pub_mqtt_topics.mod_addresses.any(read_mod_registers.address >= 0)).all()

    try:
        while True:
            if is_connected(mod.modbus_ip, mod.modbus_port):
                ModStatChild.send("Modbus device connection is UP")
                logger.info("Modbus device connection is UP")
                break
            else:
                ModStatChild.send("Modbus device connection is DOWN")
                logger.error("Modbus device connection is DOWN")
                time.sleep(10)
        Modclient = ModbusClient(mod.modbus_ip, port=mod.modbus_port)
        msg = 0
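        # msg is a simple command code received over the ModStatChild pipe:
        # 1 = connect to the Modbus device, 2 = poll the configured topics, 3 = close the connection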
        while True:
            
            if ModStatChild.poll():
                msg = ModStatChild.recv()
                print("received Msg in modbus outer while loop {}".format(msg))
                logger.info("received Msg in modbus outer while loop {}".format(msg))
            if msg == 1:
                Modclient.connect()
                ModStatChild.send("Connected to Modbus device")
                logger.info("Connected to Modbus device")
                msg = 2
            while msg == 2:
                # GetModValues = ModReadJson(Modclient, 0 , 10) 
                for topic in PubTopics:
                    GetModValues = ModReadTopic(Modclient,topic)
                    #########################
                    ModConn.send(GetModValues)
                    #########################
                    # print(GetModValues)
                time.sleep(0.5)
                if ModStatChild.poll():
                    msg = ModStatChild.recv()
                    if msg == 1:
                        print("received Msg in modbus inner while loop {}".format(msg))
                        logger.info("received Msg in modbus inner while loop {}".format(msg))
                        msg = 2
                        ModStatChild.send("Modbus device data acquisition is already running")

            while msg == 3:
                Modclient.close()
                msg = 0
                ModStatChild.send("Modbus device connection Closed")
                
            # FlModChild.send("Disconnected from Controller")
        # FlModChild.send(GetModValues)
        # if ModConn.poll():
        #     msg = ModConn.recv()
        #     if isinstance(msg, dict):
        #         if "ModWrite" in msg:
        #             if msg["ModWrite"] == True:
        #                 ModWriteJson(ModbusClient,msg)
    except Exception as e:
        ModStatChild.send("Modbus Disconnected, Modbus process Stopped")
        ModStatChild.send(str(e))
        print(e)
############------------------------------------------------------################
Exemplo n.º 51
0
 def stop(self):
     self.stopped = True
     self.status = 'stopping'
     if hasattr(self.job, 'stop'):
         self.status = self.job.stop()
     logger.info('sequence job stopped: status={}'.format(self.status))
Exemplo n.º 52
0
 def unsubscribe(self, client):
     logger.info('unsubscribing client: {} (clients: {})'.format(
         client, len(self.clients)))
     self.clients = [x for x in self.clients if x != client]
     logger.debug('clients now: {}'.format(len(self.clients)))
Exemplo n.º 53
0
    def get_geojson_carto_gs(self,
                             filepath: tuple = (),
                             opts: dict = {}) -> dict:
        # [Step - 1] Get image + check color sampling
        img_path = os.path.join(BASE_DIR, filepath[0])
        img_tiff_path = os.path.join(BASE_DIR, filepath[1])
        img_extension = os.path.splitext(img_path)[1]
        img_name = ntpath.basename(img_path).replace(img_extension, '')
        img_base_path = img_path.replace(ntpath.basename(img_path), '')

        color_preset = self.data['color_presets'][self.options['color_preset']]
        logger.info('Color Preset (Carto Grayscale): ',
                    {'color_preset': color_preset})
        do_contour_normalization = bool(
            color_preset['building']['normalize_contours']
        ) if 'normalize_contours' in color_preset['building'] else False
        image = cv2.imread(img_path, 1)

        fc_bgr_building_gray = color_preset['building']['fill']['gray']
        fc_hsv_building_gray = bgr_color_to_hsv(fc_bgr_building_gray)

        if color_preset['building']['border']['type'] == 'relative':
            fc_hsv_building_gray_darker = self.transform_relative_color(
                fc_hsv_building_gray,
                color_preset['building']['border']['value']['gray'])
        else:
            fc_hsv_building_gray_darker = self.transform_color_string_to_float(
                color_preset['building']['border']['value']['gray'])

        logger.debug(
            self.logger_base_text + 'Color Info', {
                'fill_color_bgr': {
                    'gray': fc_bgr_building_gray
                },
                'fill_color_hsv': {
                    'gray': fc_hsv_building_gray
                },
                'border_color_hsv': {
                    'gray': fc_hsv_building_gray_darker
                }
            })

        # [Step-2] Do masking on HSV Image
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)

        mask_gray = cv2.inRange(hsv, fc_hsv_building_gray,
                                fc_hsv_building_gray_darker)
        final = cv2.bitwise_or(image, image, mask=mask_gray)
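        # mask_gray selects pixels whose HSV value lies between the building fill
        # color and its darker border color; the bitwise OR retains only those pixels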

        # [Step-3] Find Contours
        json_contour_filepath = self.data['file']['json_contour'].replace(
            '<result_path>', self.data['path']['result']).replace(
                '<img_name>', img_name).replace('<preset>', 'carto-gs')
        json_contour_debug_filepath = self.data['file'][
            'json_contour_debug'].replace('<result_path>',
                                          self.data['path']['result']).replace(
                                              '<img_name>', img_name).replace(
                                                  '<preset>', 'carto-gs')
        geojson_filepath = self.data['file']['geojson'].replace(
            '<result_path>', self.data['path']['result']).replace(
                '<img_name>', img_name).replace('<preset>', 'carto-gs')

        final_gray = cv2.cvtColor(final, cv2.COLOR_BGR2GRAY)
        final_blurred = cv2.GaussianBlur(final_gray, (3, 3), 0)
        ret, final_thresh = cv2.threshold(final_blurred, 127, 255, 0)

        contours, hierarchy = cv2.findContours(final_thresh, cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)

        # contour normalization
        if do_contour_normalization:
            contours = self.normalize_contours(contours)

        ctr_json_str = json.dumps(
            {
                'contours': contours,
                'hierarchy': hierarchy
            },
            default=json_np_default_parser)
        ctr_json = json.loads(ctr_json_str)

        ctr_points = []
        for cidx in range(len(ctr_json['contours'])):
            ctr_points.append(
                list(map(lambda x: x[0], ctr_json['contours'][cidx])))

        # [Step - 4] Find Contours Geographic Coordinates
        geotiff_image = img_path.replace(img_extension, '.tif')
        translate_coords = GeoTiffProcessor.get_multi_polygon_axis_point_coordinates(
            geotiff_image, ctr_points, {'debug': False})

        final_coords = []
        geo_features = []
        for poly in translate_coords['coords']:
            poly_coords = []
            poly_geo_coords = []
            for cr in poly:
                poly_coords.append({
                    'x': cr['x'],
                    'y': cr['y'],
                    'latitude': cr['lat'],
                    'longitude': cr['long']
                })
                poly_geo_coords.append((cr['long'], cr['lat']))

            # add final closing point
            poly_geo_coords.append((poly[0]['long'], poly[0]['lat']))
            final_coords.append(poly_coords)
            geo_feature = Feature(
                geometry=Polygon([poly_geo_coords], precision=15))
            geo_features.append(geo_feature)

        geo_feature_collection = FeatureCollection(geo_features)
        geo_feature_collection_dump = geojson_dumps(geo_feature_collection,
                                                    sort_keys=True)

        with open(json_contour_filepath, 'w') as outfile:
            json.dump(final_coords, outfile)

        with open(geojson_filepath, 'w') as outfile:
            outfile.write(geo_feature_collection_dump)

        # [Step-5] Draw contours to original image clone
        final_wctrs = copy(image)
        for c in contours:
            cv2.drawContours(final_wctrs, [c], 0,
                             color_preset['building']['contour'], 2)

        # Build result
        polygon_len = len(ctr_points)
        r = {
            'file_path': geojson_filepath,
            'file_size':
            str(get_file_size(geojson_filepath, SIZE_UNIT.KB)) + ' KB',
            'polygon_total': polygon_len
        }
        if 'return_polygon_data' in opts and bool(opts['return_polygon_data']):
            r['geojson'] = json.loads(geo_feature_collection_dump)

        if self.options['save_result']:
            result_ftemplate = self.data['path'][
                'result'] + img_name + '-carto-gs-<fnm>' + img_extension

            self.write_image_results(
                result_ftemplate, '<fnm>',
                [('step-1-2-hsv-building-gray', fc_hsv_building_gray),
                 ('step-2-image-bgr', image), ('step-3-image-rgb', img_rgb),
                 ('step-4-0-hsv', hsv), ('step-4-1-hsv-mask-gray', mask_gray),
                 ('step-5-final', final), ('step-6-image-gray', final_gray),
                 ('step-7-final-blurred', final_blurred),
                 ('step-8-final-thresh', final_thresh),
                 ('step-9-image-final-with-contours', final_wctrs)])

        if self.options['show_result']:
            show_image_results([
                ("Step - 1-1 (HSV Gray Color)",
                 np.uint8([[fc_hsv_building_gray]])),
                ("Step - 2 (Image - BGR)", image),
                ("Step - 3 ( Image - RGB)", img_rgb),
                ("Step - 4-0 (HSV)", hsv),
                ("Step - 4-1 (HSV - Gray)", mask_gray),
                ("Step - 5 (Final)", final),
                ("Step - 6 (Final - Gray)", final_gray),
                ("Step - 7 (Final - Gray Blurred)", final_blurred),
                ("Step - 8 (Final - Gray Thresh)", final_thresh),
                ("Step - 9 (Final - with contours)", final_wctrs)
            ])

            # [Step - ending] Clean - up
            del contours, hierarchy, image, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, mask_gray, fc_hsv_building_gray
            return r
        else:
            # [Step - ending] Clean - up
            del contours, hierarchy, image, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, mask_gray, fc_hsv_building_gray
            return r
Exemplo n.º 54
0
    def run(self, server, devices, root_path, event_listener, on_update,
            index):
        filename_template_params = {
            'timestamp': lambda _: time.time(),
            'datetime': lambda _: time.strftime('%Y-%m-%dT%H:%M:%S-%Z'),
            'filter': 'no-filter',
            'filter_index': -1,
        }
        if 'filter_wheel' in devices and devices['filter_wheel']:
            filename_template_params['filter_index'], filename_template_params[
                'filter'] = devices['filter_wheel'].indi_sequence_filter_wheel(
                ).current_filter()

        upload_path = os.path.join(root_path, self.directory)
        self.save_directory = upload_path
        self.job_runner = ExposureSequenceJobRunner(
            server,
            devices['camera'].indi_sequence_camera(),
            self.exposure,
            self.count,
            upload_path,
            progress=self.progress,
            filename_template=self.filename,
            filename_template_params=filename_template_params,
            shots_pause=self.shots_pause,
            shots_group=self.shots_group,
            shots_group_pause=self.shots_group_pause)
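        # the callbacks below report progress and status as each exposure in the
        # sequence starts, finishes, and is saved, and register saved frames as Images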

        def on_started(job_runner):
            pass

        def on_each_started(job_runner, index):
            self.last_message = 'starting exposure {} out of {}'.format(
                index + 1, job_runner.count)
            on_update()

        def on_each_finished(job_runner, index, filename):
            self.last_message = 'finished exposure {} out of {}, saved to {}'.format(
                index + 1, job_runner.count, filename)
            on_update()

        def on_each_saved(job_runner, index, filename):
            logger.info('received file for index {}: {}'.format(
                index, filename))

            image = Image(path=filename, file_required=True)
            self.progress = job_runner.finished
            self.saved_images.append(image.id)
            main_images_db.add(image)
            on_update()

        def on_finished(job_runner):
            self.last_message = 'finished.'
            on_update()
            self.progress = job_runner.finished
            self.job_runner = None

        logger.info('Starting job runner: {}, upload_path={}'.format(
            self.job_runner, upload_path))
        self.job_runner.callbacks.add('on_started', on_started)
        self.job_runner.callbacks.add('on_each_started', on_each_started)
        self.job_runner.callbacks.add('on_each_finished', on_each_finished)
        self.job_runner.callbacks.add('on_each_saved', on_each_saved)
        self.job_runner.callbacks.add('on_finished', on_finished)
        try:
            self.job_runner.run()
        except:
            if self.job_runner:
                self.progress = self.job_runner.finished
            logger.warning('Error running exposures job')
            raise
        finally:
            self.job_runner = None
Exemplo n.º 55
0
    async def initialize(self):
        logger.info('Initializing MQTT connection')
        self.client.fast_mqtt.user_connect_handler = MQTTEventPublisher.on_connect
        self.client.fast_mqtt.client.on_disconnect = MQTTEventPublisher.on_disconnect

        await self.client.connect()
Exemplo n.º 56
0
    def get_geojson_osm(self, filepath: tuple = (), opts: dict = {}) -> dict:
        # [Step - 1] Get image + check color sampling
        img_path = os.path.join(BASE_DIR, filepath[0])
        img_tiff_path = os.path.join(BASE_DIR, filepath[1])
        img_extension = os.path.splitext(img_path)[1]
        img_name = ntpath.basename(img_path).replace(img_extension, '')
        img_base_path = img_path.replace(ntpath.basename(img_path), '')

        color_preset = self.data['color_presets'][self.options['color_preset']]
        logger.info('Color Preset (OSM): ', {'color_preset': color_preset})
        do_contour_normalization = bool(
            color_preset['building']['normalize_contours']
        ) if 'normalize_contours' in color_preset['building'] else False

        image_origin = cv2.imread(img_path, 1)
        if 'sharp_image' in color_preset['building']:
            sharp_img = self.unsharp_mask(
                image_origin, **color_preset['building']['sharp_image'])
            image_origin = copy(image_origin)

        image_new_contrast = []
        if 'adjust_contrast' in color_preset['building']:
            image = cv2.convertScaleAbs(
                image_origin,
                alpha=color_preset['building']['adjust_contrast']['alpha'],
                beta=color_preset['building']['adjust_contrast']['beta'])
            image_new_contrast = [
                cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-10),
                cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-20),
                cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-30),
                cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-50),
                cv2.convertScaleAbs(image_origin, alpha=1.0, beta=-60)
            ]
        else:
            image = copy(image_origin)

        light_brown = np.uint8([[color_preset['building']['fill']]])

        # Enhance image (ref: https://chrisalbon.com/machine_learning/preprocessing_images/enhance_contrast_of_greyscale_image/)
        # image = cv2.imread('images/plane_256x256.jpg', cv2.IMREAD_GRAYSCALE)
        # image_enhanced = cv2.equalizeHist(image)

        # Convert BGR to HSV for masking
        color_codes = []
        hsv_fill_color = cv2.cvtColor(light_brown, cv2.COLOR_BGR2HSV)
        # hsv_fill_color = cv2.cvtColor(light_brown, color_preset['building']['masking_color_mode'])

        # for index in hsv_fill_color:
        # color_codes = index[0]
        color_codes = hsv_fill_color[0][0]

        # [Step - 2] Do masking on HSV Image
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # img_rgb =copy(image)

        hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
        # hsv = cv2.cvtColor(img_rgb, color_preset['building']['masking_color_mode'])

        fill_color = (float(color_codes[0]), float(color_codes[1]),
                      float(color_codes[2]))

        find_border_color = []
        if color_preset['building']['border']['type'] == 'relative':
            temp = []
            for idx, bbv in enumerate(
                    color_preset['building']['border']['value'], 0):
                if bbv[0] == '+':
                    temp.append(float(color_codes[idx]) + float(bbv[1:]))
                elif bbv[0] == '-':
                    temp.append(float(color_codes[idx]) - float(bbv[1:]))
                else:
                    temp.append(float(bbv))
            border_color = tuple(temp)

            # border_color = (float(color_codes[0]) + color_preset['building']['border']['value'][0], float(color_codes[1]) + color_preset['building']['border']['value'][1], float(color_codes[2]) + color_preset['building']['border']['value'][2])
        else:
            find_border_color = cv2.cvtColor(
                np.uint8([[color_preset['building']['border']['value']]]),
                cv2.COLOR_BGR2HSV)
            # find_border_color = cv2.cvtColor(np.uint8([[color_preset['building']['border']['value']]]), color_preset['building']['masking_color_mode'])
            border_color = (float(find_border_color[0][0][0]),
                            float(find_border_color[0][0][1]),
                            float(find_border_color[0][0][2]))

        logger.debug(
            self.logger_base_text + 'Color Info', {
                'fill_color': fill_color,
                'border_color': border_color,
                'float_border_color': find_border_color,
                'hsv_fill_color_codes': color_codes,
                'hsv_fill_color': hsv_fill_color
            })

        mask = cv2.inRange(hsv, fill_color, border_color)
        final = cv2.bitwise_and(image, image, mask=mask)
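        # the mask selects pixels whose HSV value lies between the fill color and
        # the border color; bitwise_and keeps only those pixels of the source image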

        # self.data['path']['result']
        # self.data['file']['json_contour']

        json_contour_filepath = self.data['file']['json_contour'].replace(
            '<result_path>', self.data['path']['result']).replace(
                '<img_name>', img_name).replace('<preset>', 'osm')
        json_contour_debug_filepath = self.data['file'][
            'json_contour_debug'].replace('<result_path>',
                                          self.data['path']['result']).replace(
                                              '<img_name>', img_name).replace(
                                                  '<preset>', 'osm')
        geojson_filepath = self.data['file']['geojson'].replace(
            '<result_path>', self.data['path']['result']).replace(
                '<img_name>', img_name).replace('<preset>', 'osm')

        final_gray = cv2.cvtColor(final, cv2.COLOR_BGR2GRAY)
        final_blurred = cv2.GaussianBlur(final_gray, (5, 5), 0)
        ret, final_thresh = cv2.threshold(final_blurred, 127, 255, 0)
        contours, hierarchy = cv2.findContours(final_thresh, cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)

        # contour normalization
        if do_contour_normalization:
            contours = self.normalize_contours(contours)

        ctr_json_str = json.dumps(
            {
                'contours': contours,
                'hierarchy': hierarchy
            },
            default=json_np_default_parser)
        ctr_json = json.loads(ctr_json_str)

        ctr_points = []
        for cidx in range(len(ctr_json['contours'])):
            ctr_points.append(
                list(map(lambda x: x[0], ctr_json['contours'][cidx])))

        # [Step - 4] Find Contours Geographic Coordinates
        geotiff_image = img_tiff_path
        translate_coords = GeoTiffProcessor.get_multi_polygon_axis_point_coordinates(
            geotiff_image, ctr_points, {'debug': False})

        final_coords = []
        geo_features = []
        for poly in translate_coords['coords']:
            poly_coords = []
            poly_geo_coords = []
            for cr in poly:
                poly_coords.append({
                    'x': cr['x'],
                    'y': cr['y'],
                    'latitude': cr['lat'],
                    'longitude': cr['long']
                })
                poly_geo_coords.append((cr['long'], cr['lat']))

            # add final closing point
            poly_geo_coords.append((poly[0]['long'], poly[0]['lat']))
            final_coords.append(poly_coords)
            geo_feature = Feature(
                geometry=Polygon([poly_geo_coords], precision=15))
            geo_features.append(geo_feature)

        geo_feature_collection = FeatureCollection(geo_features)
        geo_feature_collection_dump = geojson_dumps(geo_feature_collection,
                                                    sort_keys=True)

        with open(json_contour_filepath, 'w') as outfile:
            json.dump(final_coords, outfile)

        with open(geojson_filepath, 'w') as outfile:
            outfile.write(geo_feature_collection_dump)

        # [Step - 5] Draw contours to original image clone
        final_wctrs = copy(image)  # final_wctrs = copy(image_origin)  # final_wctrs = copy(final)
        for c in contours:
            cv2.drawContours(final_wctrs, [c], 0,
                             color_preset['building']['contour'], 2)

        # Build result
        polygon_len = len(ctr_points)
        r = {
            'file_path': geojson_filepath,
            'file_size':
            str(get_file_size(geojson_filepath, SIZE_UNIT.KB)) + ' KB',
            'polygon_total': polygon_len
        }
        if 'return_polygon_data' in opts and bool(opts['return_polygon_data']):
            r['geojson'] = json.loads(geo_feature_collection_dump)

        if self.options['save_result']:
            result_ftemplate = self.data['path'][
                'result'] + img_name + '-<fnm>' + img_extension
            if 'sharp_image' in color_preset['building']:
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-sharpen-1'),
                    sharp_img)
            if 'adjust_contrast' in color_preset['building']:
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-contrast-1'),
                    image_new_contrast[0])
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-contrast-2'),
                    image_new_contrast[1])
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-contrast-3'),
                    image_new_contrast[2])
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-contrast-4'),
                    image_new_contrast[3])
                cv2.imwrite(
                    result_ftemplate.replace('<fnm>', 'step-0-contrast-5'),
                    image_new_contrast[4])
            cv2.imwrite(
                result_ftemplate.replace('<fnm>', 'step-1-hsv-light-color'),
                hsv_fill_color)
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-2-image-bgr'),
                        image)
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-3-image-rgb'),
                        img_rgb)
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-4-hsv'), hsv)
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-5-final'),
                        final)
            cv2.imwrite(result_ftemplate.replace('<fnm>', 'step-6-image-gray'),
                        final_gray)
            cv2.imwrite(
                result_ftemplate.replace('<fnm>', 'step-7-final-blurred'),
                final_blurred)
            cv2.imwrite(
                result_ftemplate.replace('<fnm>', 'step-8-final-thresh'),
                final_thresh)
            cv2.imwrite(
                result_ftemplate.replace('<fnm>',
                                         'step-9-image-final-with-contours'),
                final_wctrs)

        if self.options['show_result']:
            cv2.imshow("Step - 1 (HSV Light Color)", hsv_fill_color)
            cv2.imshow("Step - 2 (Image - BGR)", image)
            cv2.imshow("Step - 3 ( Image - RGB)", img_rgb)
            cv2.imshow("Step - 4 (HSV)", hsv)
            cv2.imshow("Step - 5 (Final)", final)
            cv2.imshow("Step - 6 (Final - Gray)", final_gray)
            cv2.imshow("Step - 7 (Final - Gray Blurred)", final_blurred)
            cv2.imshow("Step - 8 (Final - Gray Thresh)", final_thresh)
            cv2.imshow("Step - 9 (Final - with contours)", final_wctrs)
            # cv2.imshow("Step - 10 (Final - with shape contours)", final_shape_ctrs)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

            # [Step - ending] Clean - up
            del contours, hierarchy, image, hsv_fill_color, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, ctr_json, ctr_json_str, final_coords, geo_features, ctr_points
            return r
        else:
            # [Step - ending] Clean - up
            del contours, hierarchy, image, hsv_fill_color, img_rgb, hsv, final, final_gray, final_wctrs, final_blurred, final_thresh, ctr_json, ctr_json_str, final_coords, geo_features, ctr_points
            return r
Exemplo n.º 57
0
 def __init__(self, mqtt_host: str, mqtt_port: int,
              mqtt_client_name: str, app: FastAPI):
     logger.info(f'simulating a client to {mqtt_host}')
     self.mqtt_client_name = mqtt_client_name
     self.mqtt_host = mqtt_host
     self.mqtt_port = mqtt_port
Exemplo n.º 58
0
 def publish(self, topic, message):
     logger.info(f'simulated publishing to {topic}. message: {message}')
Exemplo n.º 59
0
 async def publish_state(self, customer_state: CustomerState):
     message = self.prepare_payload(customer_state)
     logger.info(f'Publishing {message}')
Exemplo n.º 60
0
    def scrape_green(self):
        # handle these special cases, where we compute the pdf rather than looking for it
        if "oai:arXiv.org" in self.pmh_id:
            self.scrape_metadata_url = self.url
            self.scrape_pdf_url = self.url.replace("abs", "pdf")

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # https://ink.library.smu.edu.sg/do/oai/
        if self.endpoint and self.endpoint.id == 'ys9xnlw27yogrfsecedx' and u'ink.library.smu.edu.sg' in self.url:
            if u'viewcontent.cgi?' in self.url:
                return
            if self.pmh_record and find_normalized_license(self.pmh_record.license):
                self.scrape_metadata_url = self.url
                self.set_version_and_license()
                return
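        # otherwise, scrape the landing page for a fulltext link, then determine
        # version and license from the PDF if they are still unknown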

        if not self.scrape_pdf_url or not self.scrape_version:
            with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
                if not self.scrape_pdf_url:
                    my_webpage.scrape_for_fulltext_link()
                    self.error += my_webpage.error
                    if my_webpage.is_open:
                        logger.info(u"** found an open copy! {}".format(my_webpage.fulltext_url))
                        self.scrape_updated = datetime.datetime.utcnow().isoformat()
                        self.scrape_metadata_url = self.url
                        if my_webpage.scraped_pdf_url:
                            self.scrape_pdf_url = my_webpage.scraped_pdf_url
                        if my_webpage.scraped_open_metadata_url:
                            self.scrape_metadata_url = my_webpage.scraped_open_metadata_url
                        if my_webpage.scraped_license:
                            self.scrape_license = my_webpage.scraped_license
                        if my_webpage.scraped_version:
                            self.scrape_version = my_webpage.scraped_version
                if self.scrape_pdf_url and not self.scrape_version:
                    self.set_version_and_license(r=my_webpage.r)

        if self.scrape_pdf_url and not self.scrape_version:
            with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
                my_webpage.set_r_for_pdf()
                self.set_version_and_license(r=my_webpage.r)

        if self.is_open and not self.scrape_version:
            self.scrape_version = self.default_version()

        # associate certain landing page URLs with PDFs
        # https://repository.uantwerpen.be
        if self.endpoint and self.endpoint.id == 'mmv3envg3kaaztya9tmo':
            if self.scrape_pdf_url and self.scrape_pdf_url == self.scrape_metadata_url and self.pmh_record:
                logger.info(u'looking for landing page for {}'.format(self.scrape_pdf_url))
                landing_urls = [u for u in self.pmh_record.urls if u'hdl.handle.net' in u]
                if len(landing_urls) == 1:
                    logger.info(u'trying landing page {}'.format(landing_urls[0]))

                    try:
                        if http_get(landing_urls[0]).status_code == 200:
                            self.scrape_metadata_url = landing_urls[0]
                    except:
                        pass

                    if self.scrape_metadata_url:
                        logger.info(u'set landing page {}'.format(self.scrape_metadata_url))

        # https://lirias.kuleuven.be
        if (self.endpoint
            and self.endpoint.id == 'ycf3gzxeiyuw3jqwjmx3'
            and self.scrape_pdf_url == self.scrape_metadata_url
            and self.scrape_pdf_url and 'lirias.kuleuven.be' in self.scrape_pdf_url
        ):
            if self.pmh_record and self.pmh_record.bare_pmh_id and 'oai:lirias2repo.kuleuven.be:' in self.pmh_record.bare_pmh_id:
                self.scrape_metadata_url = 'https://lirias.kuleuven.be/handle/{}'.format(
                    self.pmh_record.bare_pmh_id.replace('oai:lirias2repo.kuleuven.be:', '')
                )