Example #1
    def get_pmh_input_record(self, first, last):
        args = {}
        args['metadataPrefix'] = 'oai_dc'
        pmh_records = []
        error = None

        my_sickle = self.get_my_sickle(self.pmh_url)
        logger.info(u"connected to sickle with {}".format(self.pmh_url))

        args['from'] = first.isoformat()[0:10]
        if last:
            args["until"] = last.isoformat()[0:10]

        if self.pmh_set:
            args["set"] = self.pmh_set

        logger.info(u"calling ListRecords with {} {}".format(
            self.pmh_url, args))
        try:
            pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
            # logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
            pmh_input_record = self.safe_get_next_record(pmh_records)
        except NoRecordsMatch as e:
            logger.info(u"no records with {} {}".format(self.pmh_url, args))
            pmh_input_record = None
        except Exception as e:
            logger.exception(u"error with {} {}".format(self.pmh_url, args))
            pmh_input_record = None
            error = "error in get_pmh_input_record: {} {} calling {}".format(
                e.__class__.__name__,
                unicode(e.message).encode("utf-8"),
                my_sickle.get_http_response_url())
            print error

        return (pmh_input_record, pmh_records, error)
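A minimal usage sketch for the method above, mirroring the 30-day window that set_identify_and_initial_query (Example #9) passes in; the repo object stands in for the harvester instance and is an assumption:

import datetime

last = datetime.datetime.utcnow()
first = last - datetime.timedelta(days=30)
pmh_input_record, pmh_records, error = repo.get_pmh_input_record(first, last)
if error:
    print(error)
elif pmh_input_record:
    print(pmh_input_record.metadata)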
Example #2
def save_vote():
    # Get the team and time the vote was cast.
    team = request.form["team"]
    time_cast = datetime.datetime.utcnow()
    # Verify that the team is one of the allowed options
    if team != "TABS" and team != "SPACES":
        logger.warning(team)
        return Response(response="Invalid team specified.", status=400)

    stmt = sqlalchemy.text(
        "INSERT INTO votes (time_cast, candidate)" " VALUES (:time_cast, :candidate)"
    )
    try:
        with db.connect() as conn:
            conn.execute(stmt, time_cast=time_cast, candidate=team)
    except Exception as e:
        logger.exception(e)
        return Response(
            status=500,
            response="Unable to successfully cast vote! Please check the "
            "application logs for more details.",
        )

    return Response(
        status=200,
        response="Vote successfully cast for '{}' at time {}!".format(team, time_cast),
    )
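For context, a sketch of the table the INSERT above targets; the column types and the extra primary-key column are assumptions inferred from the statement:

import sqlalchemy

def create_votes_table(db):
    # Only time_cast and candidate are known from the INSERT; the rest is assumed.
    stmt = sqlalchemy.text(
        "CREATE TABLE IF NOT EXISTS votes "
        "(vote_id SERIAL NOT NULL, time_cast timestamp NOT NULL, "
        "candidate VARCHAR(6) NOT NULL, PRIMARY KEY (vote_id))"
    )
    with db.connect() as conn:
        conn.execute(stmt)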
Example #3
async def crawl(crawl_requests):
    async for crawl_request in crawl_requests:
        print(f'Receiving Request: {crawl_request.url}')
        visit_id = (uuid.uuid4().int & (1 << 53) - 1) - 2**52
        driver, logs = await app.loop.run_in_executor(
            thread_pool,
            partial(
                get_driver,
                visit_id=visit_id,
                crawl_id=crawl_request.crawl_id,
                ws_port=WS_PORT,
            ))
        for log in logs:
            logger.info(log)
        success, failure_type, message, exceptions = await app.loop.run_in_executor(
            thread_pool, partial(do_crawl,
                                 driver=driver,
                                 url=crawl_request.url))
        print(f'Finishing Request: {crawl_request.url}')
        for e in exceptions:
            logger.exception(e)
        result = CrawlResult(
            request_id=crawl_request.request_id,
            visit_id=visit_id,
            url=crawl_request.url,
            success=success,
            time_stamp=str(datetime.datetime.now(pytz.utc)),
            failure_type=failure_type,
            message=message,
        )
        await crawl_result_topic.send(value=result)
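The visit_id expression packs a random UUID into a signed 53-bit range, presumably so the id stays exactly representable wherever integers are stored as IEEE-754 doubles. A small sketch of the arithmetic:

import uuid

raw = uuid.uuid4().int & ((1 << 53) - 1)  # keep the low 53 bits: 0 .. 2**53 - 1
visit_id = raw - 2**52                    # recenter into -2**52 .. 2**52 - 1
assert -2**52 <= visit_id < 2**52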
Example #4
def _handle_dialog_remove_started(request):
    """
    Check whether the provided ticker is valid and the stock is in the watchlist; if so, ask for confirmation. Otherwise, report the current state.
    :type request: AlexaRequest
    """
    logger.debug("dialogState STARTED")
    user_id = request.user_id()

    # Check if ticker is provided
    try:
        ticker = _check_valid_ticker_provided(request)
    except AttributeError as e:
        logger.exception("No valid ticker provided")
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_FAIL
        return ResponseBuilder.create_response(request, message=message) \
            .with_reprompt(strings.INTENT_GENERAL_REPROMPT)

    # Check if stock is in users Watchlist
    is_in_watchlist = Watchlist.ticker_in_watchlist_exists(user_id, ticker)

    # Inform that stock not in watchlist, or ask user to confirm ticker remove
    if is_in_watchlist:
        logger.debug(
            f"Ask confirmation: remove stock {ticker} from user:{user_id} watchlist"
        )
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_ASK_CONFIRMATION \
            .format(ticker)
        return ResponseBuilder.create_response(request, message) \
            .with_dialog_confirm_intent()
    else:
        logger.debug(
            f"Trying to remove stock {ticker}, which is not in wathclist")
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_NOT_THERE.format(ticker)
        return ResponseBuilder.create_response(request, message)
Example #5
 def run(self):
     while 1:
         try:
             if app.config['IS_QUIT']:
                 break
             p, request, que, imgpath = app.config['RECGQUE'].get(timeout=1)
         except Queue.Empty:
             pass
         except Exception as e:
             logger.error(e)
             time.sleep(1)
         else:
             try:
                 carinfo = self.cre.imgrecg(imgpath, request['coord'])
                 if carinfo is None:
                     result = None
                     logger.error('Recognise Error')
                 elif carinfo['head']['code'] == 0:
                     result = None
                 else:
                     result = carinfo['body']
             except Exception as e:
                 logger.exception(e)
                 result = None
             try:
                 que.put(result)
             except Exception as e:
                 logger.error(e)
Example #6
def reset(token):

    try:
        user = User.query.filter_by(token=token).first()
    except Exception as err:
        logger.exception(err)
        flash("can not get user by token = {} from database".format(token), "negative")
        return redirect(url_for("userfe.reset", token=token))

    if not user:
        abort(404)

    form = FormReset()
    if form.validate_on_submit():

        if user:
            try:
                user.password = form.password.data.strip()
                user.token = None
                db.session.commit()
            except Exception as err:
                logger.exception(err)
                flash("can not reset password at this moment", "negative")
                return redirect(url_for("userfe.reset", token=token))

            flash("Your password has been reset, you can log in.", "positive")
            return redirect(url_for("userfe.login"))

    return render_template("frontend/user/reset.html", form=form, token=token)
Example #7
    def post(self):
        post_data = request.get_json()
        name = post_data.get("firstName")
        surname = post_data.get("lastName")
        email = post_data.get("email")
        password = post_data.get("password")

        user = get_user_by_email(email)
        if user:
            auth_namespace.abort(400, "Sorry. That email already exists.")

        user = User.add_user(name, surname, email, password)

        # msg = Message("Testing email", recipients=[email])
        # mail.send(msg)
        try:
            with open('./templates/registration_email.html') as file:
                template = file.read()
            guard.send_registration_email(email,
                                          user=user,
                                          confirmation_sender='SchabCoin',
                                          template=template)
        except Exception as e:
            logger.exception(e)
            return "Failed to send registration email", 500

        ret = {
            'message': f'successfully sent registration email to user {email}'
        }
        return ret, 200
Example #8
def update_sector_analysis(sector_names):
    if not sector_names:
        return {}, [], [], []
    try:
        sector_dict = {}
        for s in sector_names:
            sector_data = get_sector_data(s)
            for ticker in sector_data:
                sector_data[ticker]['advanced-stats']['sector'] = s
            sector_dict.update(sector_data)
        sector_df = pd.DataFrame.from_dict(
            {s: sector_dict[s]['advanced-stats']
             for s in sector_dict},
            orient='index')
        xfilter_options = [{
            'label': i,
            'value': i
        } for i in list(sector_df.columns) + [
            'EBITDAToEV(%)', 'EBITDAToRevenueMargin', 'TotalAssets',
            'EBITDAToAssets(%)'
        ]]
        company_options = [{
            'label': c,
            'value': c
        } for c in list(sector_df.companyName)]
        return sector_dict, xfilter_options, xfilter_options, company_options
    except Exception as e:
        logger.exception(e)
        return {}, [], [], []
Example #9
    def set_identify_and_initial_query(self):
        if not self.pmh_url:
            self.harvest_identify_response = u"error, no pmh_url given"
            return

        my_sickle = None
        try:
            # set timeout quick... if it can't do this quickly, won't be good for harvesting
            logger.debug(u"getting my_sickle for {}".format(self))
            my_sickle = self.get_my_sickle(self.pmh_url, timeout=10)
            data = my_sickle.Identify()
            self.harvest_identify_response = "SUCCESS!"

        except Exception as e:
            logger.exception(u"in set_identify_and_initial_query")
            self.error = u"error in calling identify: {} {}".format(
                e.__class__.__name__,
                unicode(e.message).encode("utf-8"))
            if my_sickle:
                self.error += u" calling {}".format(
                    my_sickle.get_http_response_url())

            self.harvest_identify_response = self.error

        last = datetime.datetime.utcnow()
        first = last - datetime.timedelta(days=30)
        self.sample_pmh_record = None
        (pmh_input_record, pmh_records,
         error) = self.get_pmh_input_record(first, last)
        if error:
            self.harvest_test_recent_dates = error
        elif pmh_input_record:
            self.harvest_test_recent_dates = "SUCCESS!"
            self.sample_pmh_record = json.dumps(pmh_input_record.metadata)
        else:
            self.harvest_test_recent_dates = "error, no pmh_input_records returned"
Example #10
def update_graph(df_dict, column_name, ticker):
    if not df_dict:
        return {}
    try:
        df_str_format = pd.DataFrame.from_dict(
            df_dict[ticker]['fin_report_dict'])
        df = pd.concat([
            df_str_format.iloc[:, 0],
            df_str_format.iloc[:, 1:].applymap(get_number_from_string)
        ],
                       axis=1)
        for col in list(df.columns):
            if '%' in col:  # scale up ratio by 100 if unit is %
                df.loc[:, col] *= 100
        fig = px.line(df, x='index', y=column_name, line_shape='spline')
        fig.update_traces(mode='lines+markers')
        fig.update_layout(
            title=ticker +
            ": Past Performance is not a guarantee of Future Returns",
            xaxis_title="Year",
            yaxis_title="Value ($ or Ratio or %)",
            legend_title="Parameter(s)")
        return fig
    except Exception as e:
        logger.exception(e)
        return {}
Example #11
def check_ticker_validity(ticker):
    try:
        if not ticker:
            raise ValueError(
                "Ticker Value is Empty, please Type Ticker, press Enter or Tab to continue analysis."
            )
        ticker_allcaps = ticker.upper()
        # validate against https://sandbox.iexapis.com/stable/ref-data/symbols?token=
        if ticker_allcaps in ticker_dict():
            is_valid_ticker = True
            return (is_valid_ticker, not is_valid_ticker,
                    'Getting financial data... for: ' + ticker_dict()[ticker_allcaps],
                    [{'status-info': 'Market Price used in Calculation: ',
                      'supp-data': ''}])
        else:
            raise ValueError("Invalid Ticker entered: " + ticker +
                             '\nValid Tickers from listed Exchanges:\n' +
                             '\n'.join(exchange_list()))
    except Exception as InvalidTicker:
        # dbc.Alert(
        #     str(InvalidTicker),
        #     id="alert-invalid-ticker",
        #     dismissable=True,
        #     is_open=True,
        # )
        logger.exception(InvalidTicker)
        return False, True, '', handler_data_message(
            'See Error Message(s) below:', traceback.format_exc())
Example #12
def grow_followers_worker(follow_bot, unfollow_bot):
    try:
        user_followings = unfollow_bot.API.user_following(
            unfollow_bot.API.authenticated_user_id,
            rank_token=unfollow_bot.API.generate_uuid()).get("users")

        followings = len(user_followings)
    except Exception as e:
        print(e)
        followings = 0

    print("Total following:", followings)

    if followings > 6000:
        bot1 = unfollow_bot
        bot2 = follow_bot
    else:
        bot1 = follow_bot
        bot2 = unfollow_bot

    while True:
        try:
            with app.app_context():
                bot1.start()
        except Exception as e:
            logger.exception("Unfollow failed to start")
        try:
            with app.app_context():
                bot2.start()
        except Exception as e:
            logger.exception("Follow failed to start")
Example #13
def rekapRegister(cid):
    register = common._useMysql("rekap_reg")
    siswas = api.all_santri()
    try:
        temps = []
        siswa = [i for i in siswas if i.get("nik") in (a[0] for a in register)]
        temps.append([
            "NIK", "NAMA LENGKAP", "BLOK", "KAMAR", "LEMBAGA", "KELAS",
            "JURUSAN"
        ])
        for i in siswa:
            data = (
                i.get("nik"),
                i.get("nama"),
                i.get("blok"),
                i.get("kamar"),
                i.get("lembaga") or "-",
                i.get("kelas") or "-",
                "{} {}".format(
                    i.get("jurusan") or "-",
                    i.get("rombel") or "-"),
            )
            temps.append(data)
        cetakExcel(cid, temps)
    except Exception as e:
        logger.exception(e)
Example #14
    def gets_a_word_doc(self, link, base_url):
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} is a word doc".format(absolute_url))

        start = time()
        try:
            r = http_get(absolute_url,
                         stream=True,
                         publisher=self.publisher,
                         session_id=self.session_id,
                         ask_slowly=self.ask_slowly)

            if r.status_code != 200:
                return False

            if is_a_word_doc(r):
                return True

        except Exception as e:
            logger.exception(u'error in gets_a_word_doc: {}'.format(e))

        return False
Example #15
    def set_identify_and_initial_query(self):
        if not self.pmh_url:
            self.harvest_identify_response = u"error, no pmh_url given"
            return

        my_sickle = None
        try:
            # set timeout quick... if it can't do this quickly, won't be good for harvesting
            logger.debug(u"getting my_sickle for {}".format(self))
            my_sickle = _get_my_sickle(self.pmh_url, timeout=10)
            my_sickle.Identify()
            self.harvest_identify_response = "SUCCESS!"

        except Exception as e:
            logger.exception(u"in set_identify_and_initial_query")
            self.error = u"error in calling identify: {} {}".format(
                e.__class__.__name__, unicode(e.message).encode("utf-8"))
            if my_sickle:
                self.error += u" calling {}".format(my_sickle.get_http_response_url())

            self.harvest_identify_response = self.error

        self.sample_pmh_record = None

        try:
            sample_pmh_record = self.get_recent_pmh_record()
            if sample_pmh_record:
                self.harvest_test_recent_dates = "SUCCESS!"
                self.sample_pmh_record = json.dumps(sample_pmh_record.metadata)
            else:
                self.harvest_test_recent_dates = "error, no pmh_input_records returned"
        except Exception as e:
            self.error = u"error in get_recent_pmh_record: {} {}".format(
                e.__class__.__name__, unicode(e.message).encode("utf-8"))
            self.harvest_test_recent_dates = self.error
Example #16
def translate_1_2():
    """
    :param sl: source language
    :type sl: string
    :param tl: target language
    :type tl: string
    :param m: mode ( 1 for normal, 2 for better )
    :type m: int
    :param t: text to be translated
    :type t: string

    Translates given text.
    """
    keys = ('t', 'm', 'sl', 'tl')
    text, mode, source, target = map(lambda k: request.form[k].strip(), keys)

    try:
        payload = translate(text, mode, source, target)

        return jsonify(payload)

    except HTTPException as e:
        return e.message, e.status_code

    except Exception as e:
        logger.exception(e)
        return str(e), 500
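A hypothetical invocation sketch using Flask's test client; the route path and the app object are assumptions, since the route decorator is not shown:

with app.test_client() as client:
    resp = client.post('/v1.2/translate',
                       data={'t': 'Hello world', 'm': '1', 'sl': 'en', 'tl': 'ko'})
    print(resp.status_code, resp.get_data(as_text=True))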
Example #17
    def post(self):
        """Add a new offer"""
        logger.info("Offers.post() request_body: %s", str(request.get_json()))
        try:
            content = json.loads(request.form['data'])
            user_id = current_user().id
            photo = request.files.get('photo', None)
            photo_url = cloudinary_uploader.upload(
                photo)['url'] if photo else None

            for parameter in [
                    'name', 'portions_number', 'longitude', 'latitude',
                    'pickup_times', 'offer_expiry'
            ]:
                if parameter not in content:
                    return f"{parameter} missing in request", 400

            offer_id = Offer.add_offer(
                user_id, content['name'], True, content['portions_number'], 0,
                content['longitude'], content['latitude'], datetime.now(),
                content['pickup_times'], content['offer_expiry'],
                content.get('description', None), photo_url)

            for tag_id in content.get('tags', []):
                OffersTags.add_offer_tag(offer_id, tag_id)
            return "Offer has been added", 201

        except Exception as e:
            logger.exception("Offers.post(): %s", str(e))
            return "Couldn't add offers", 500
Example #18
def _handle_dialog_add_started(request):
    """
    Check that a valid ticker was provided, then ask the user to confirm adding it, noting when it is already in the watchlist.
    :type request: AlexaRequest
    """
    print("LOG-d: dialogState STARTED")

    # Check if ticker is provided
    try:
        ticker = _check_valid_ticker_provided(request)
    except AttributeError as e:
        logger.exception("No valid ticker provided")
        message = strings.INTENT_ADDED_TO_WATCHLIST_FAIL
        return ResponseBuilder.create_response(request, message=message) \
            .with_reprompt(strings.INTENT_GENERAL_REPROMPT)

    # Ask user to confirm ticker add
    message = strings.INTENT_ADD_TO_WATCHLIST_ASK_CONFIRMATION.format(ticker)

    # Check if ticker not already in Watchlist
    user_id = request.get_user_id()
    watchlist_tickers = Watchlist.get_users_tickers(user_id)
    for ticker_in_watchlist in watchlist_tickers:
        if ticker == ticker_in_watchlist:
            message = strings.INTENT_ADDED_TO_WATCHLIST_EXISTS.format(ticker)

    return ResponseBuilder.create_response(request, message) \
        .with_dialog_confirm_intent()
Example #19
async def cookie2user(cookie_str):
    if not cookie_str:
        return None
    try:
        L = cookie_str.split('-')
        if len(L) != 3:
            # if there aren't exactly three parts, the cookie doesn't match the sha1 string we built; return None
            return None
        uid, expires, sha1 = L
        # unpack the user id, expiry timestamp, and sha1 digest
        if int(expires) < time.time():
            # if the cookie has expired (it lasts one day), return None
            return None
        user = await User.find(uid)
        # look the user up by id (the primary key) to confirm the user exists
        if user is None:
            return None
        s = '%s-%s-%s-%s' % (uid, user.passwd, expires, _COOKIE_KEY)
        # rebuild the verification sha1 string from the stored user data
        if sha1 != hashlib.sha1(s.encode('utf-8')).hexdigest():
            logger.info('invalid sha1')
            return None
        user.passwd = '*******'
        return user
    except Exception as e:
        logger.exception(e)
        return None
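A sketch of the counterpart that would mint such a cookie, reconstructed from the verification logic above (the original builder is not shown, and user.id is an assumed attribute):

import hashlib
import time

def user2cookie(user, max_age=86400):
    # build "uid-expires-sha1(uid-passwd-expires-_COOKIE_KEY)", the exact format cookie2user checks
    expires = str(int(time.time() + max_age))
    s = '%s-%s-%s-%s' % (user.id, user.passwd, expires, _COOKIE_KEY)
    return '-'.join([user.id, expires, hashlib.sha1(s.encode('utf-8')).hexdigest()])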
Example #20
    def set_r_for_pdf(self):
        self.r = None
        try:
            self.r = http_get(url=self.scraped_pdf_url, stream=False, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in set_r_for_pdf"
            logger.info(self.error)
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except Exception as e:
            self.error += u"ERROR: Exception error in set_r_for_pdf"
            logger.exception(self.error)
Example #21
def _insertAbsen(data):
    conn = MySQLdb.connect(host=HOST, user=USER, passwd=PASS, db=DB)
    logger.info(data)
    try:
        cur = conn.cursor()
        book = xlrd.open_workbook('{}'.format(data))
        sheet = book.sheet_by_index(0)
        jum = _useMysql("id_terakhir")[0]
        no = jum + 1
        sql = 'insert into absen (id, nik, ubudiyah, alquran, belajar, sekolah, diniyah, bulan) values (%s,%s,%s,%s,%s,%s,%s,%s)'
        for r in range(1, sheet.nrows):
            nik = sheet.cell(r, 0).value
            ubudiyah = sheet.cell(r, 1).value
            alquran = sheet.cell(r, 2).value
            belajar = sheet.cell(r, 3).value
            sekolah = sheet.cell(r, 4).value
            diniyah = sheet.cell(r, 5).value
            bulan = sheet.cell(r, 6).value

            text = (no, nik, ubudiyah, alquran, belajar, sekolah, diniyah,
                    bulan)
            cur.execute(sql, text)
            no += 1
        conn.commit()
        cur.close()
        conn.close()
        os.remove('{}'.format(data))
        return "Sudah Di Insert Mbak Absennya, Senyum Dong :p"
    except Exception as e:
        conn.rollback()
        conn.close()
        logger.exception(e)
        return str(e)
Example #22
 def safe_get_next_record(self, current_record, tries=3):
     self.error = None
     try:
         next_record = next(current_record)
     except (requests.exceptions.HTTPError,
             requests.exceptions.SSLError) as e:
         if tries > 0:
             logger.info("requests exception! trying again {}".format(e))
             return self.safe_get_next_record(current_record, tries - 1)
         else:
             logger.info("requests exception! skipping {}".format(e))
             self.error = "requests error in safe_get_next_record; try again"
             return None
     except (KeyboardInterrupt, SystemExit):
         # done
         return None
     except StopIteration:
         logger.info("stop iteration! stopping")
         return None
     except NoRecordsMatch:
         logger.info("no records! stopping")
         return None
     except Exception as e:
         logger.exception("misc exception!: {}  skipping".format(e))
         self.error = "error in safe_get_next_record"
         return None
     return next_record
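The bounded-retry idea above generalizes beyond OAI-PMH iteration; a minimal standalone sketch of the same pattern as a decorator:

import functools
import logging

logger = logging.getLogger(__name__)

def with_retries(exceptions, tries=3):
    # retry fn up to `tries` times on the given exceptions, then give up and return None
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(tries):
                try:
                    return fn(*args, **kwargs)
                except exceptions as e:
                    logger.info("attempt %d failed: %s", attempt + 1, e)
            logger.info("giving up after %d tries", tries)
            return None
        return wrapper
    return decorator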
Example #23
	def apply(self):
		logger.info(f'Processing action ({self.__class__.__name__}) for ({self.isp.profile.email})...')
		driver = self.isp.driver
		profile = self.isp.profile

		# print('Start ActionChains...')
		# Go to spam section.
		driver.get('https://mail.yahoo.com/d/folders/6')

		# let javascript requests finish.
		time.sleep(5)

		# Scroll down.
		with utils.scroll_down(driver, 'div[data-test-id=virtual-list]', ignored_exceptions=(JavascriptException,)):
			time.sleep(2)

			total_messages = self.isp.get_total_messages()

			if not isinstance(total_messages, int):
				# set a default value or exit.
				total_messages = 0

			actions = ActionChains(driver)
			# Archive all messages.
			try:
				# scroll top to open the first message.
				with utils.scroll_up(driver, 'div[data-test-id=virtual-list]', ignored_exceptions=(JavascriptException,)):
					messages = driver.find_elements_by_css_selector('a[data-test-id=message-list-item]')
					messages[0].click()
					# get the amount of messages to open.
					last_message = common.get_amount_of_message(total_messages)
					click.secho(f'({profile.email}) Total messages {total_messages}: {last_message} messages will be opened.', fg='bright_black')

					with click.progressbar(length=last_message, label=f'Opening messages ({profile.email})...', show_pos=True) as bar:
						for i in range(last_message):
							actions = ActionChains(driver)
							actions.send_keys(Keys.ARROW_RIGHT)
							# add a star to the current message.
							if random.random() <= app_settings.MESSAGES_STARTS_RATIO:
								actions.send_keys('l')
							actions.perform()

							# show the progress
							# print(f'\r{i+1}/{last_message}', end='')

							bar.update(1) # +=1 each time

							# clear the all chained actions (is not working, it's a bug in selenium source code).
							# actions.reset_actions()

							time.sleep(random.uniform(3, 5))


			except TimeoutException:
				logger.warning(f'({self.ACTION.name}) {profile.email:.<40} [WARNING]')
			except Exception as e:
				logger.exception(f'[{self.ACTION.name}] {profile.email:.<40} [Error]')
			else:
				logger.info(f'({self.ACTION.name}) {profile.email:.<40} [DONE]')
Example #24
def get_user_id(user_name):
    logger.debug("In getting user id")
    try:
        user = mm_client.users.get_user_by_username(user_name)
        return True, user["id"]
    except Exception as e:
        logger.exception("Exception in getting user id from mattermost server")
        return False, None
Example #25
def download_image(link):
    try:
        res = requests.get(link)
        res.raise_for_status()
        return True, res.content
    except Exception as e:
        logger.exception("Exception in downloading image")
        return False, None
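A hypothetical sketch chaining the two (ok, value) helpers above; the username and URL are placeholders:

ok, user_id = get_user_id("example.user")
if ok:
    ok, image_bytes = download_image("https://example.com/avatar.png")
    if ok:
        print("downloaded {} bytes for user {}".format(len(image_bytes), user_id))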
Example #26
def do_get_responses():
    page = get_responses(invalid=False)

    try:
        return jsonify([item.to_dict() for item in page.items])
    except AttributeError:
        logger.exception("No items in page")
        return jsonify({}), 404
Example #27
 def save(self):
     try:
         db.session.merge(self)
         db.session.commit()
     except OperationalError as e:
         logger.exception(
             "Can't connect to MySQL server ottobotdb.clccaawfuuph.eu-central-1.rds.amazonaws.com"
         )
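One caveat in the snippet above: a failed commit leaves the session in an unusable state. A variant sketch that rolls back first, under the same assumed imports:

def save(self):
    try:
        db.session.merge(self)
        db.session.commit()
    except OperationalError:
        db.session.rollback()  # keep the session usable after the failed commit
        logger.exception("Can't connect to MySQL server")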
Example #28
 def safedispatch(self, environ, start_response):
     try:
         return self.appdispatch(environ, start_response)
     except: 
         if self.debug:
             raise
         logger.exception("Exception")
         return Response("Fejlsidens fejlside.")(environ, start_response)
Example #29
 def create(cls, **kwargs):
     try:
         obj = cls(**kwargs)
         db.session.add(obj)
         db.session.commit()
         return obj
     except:
         logger.exception('Failed to create FakeMixn for %s with %s', cls.__name__, kwargs)
Example #30
def _call_biz(url, max_cnt):
    try:
        res = requests.get(url, params=dict(request.args, max_cnt=max_cnt))
        assert res.status_code == 200, "status_code is: {}, not 200!".format(res.status_code)
        return res.text
    except Exception as e:
        logger.exception(e)
        return 'error: {}'.format(e)
Example #31
def feedback():
    try:
        with open(driver.feedback_path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(request.form, ensure_ascii=False) + '\n')
        return 'OK'
    except Exception as e:
        logger.exception(e)
        return 'ERROR'
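A hypothetical client-side sketch posting to the handler above; the route and form fields are assumptions:

import requests

resp = requests.post("http://localhost:5000/feedback",
                     data={"rating": "5", "comment": "works for me"})
print(resp.text)  # "OK" on success, "ERROR" otherwise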
Example #32
    def set_version_and_license(self, r=None):
        self.updated = datetime.datetime.utcnow().isoformat()

        if self.is_pmc:
            self.set_info_for_pmc_page()
            return

        # set as default
        self.scrape_version = "submittedVersion"

        is_updated = self.update_with_local_info()

        # now try to see what we can get out of the pdf itself

        if not r:
            logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
            return

        try:
            # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
            if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
                self.scrape_version = "publishedVersion"

            text = convert_pdf_to_txt(r, max_pages=25)

            # logger.info(text)

            if text and self.scrape_version == "submittedVersion":
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                    re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                    re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                    re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL)
                    ]

                for pattern in patterns:
                    if pattern.findall(text):
                        logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                        self.scrape_version = "publishedVersion"

            if not self.scrape_license:
                open_license = find_normalized_license(text)
                if open_license:
                    logger.info(u'found license in PDF: {}'.format(open_license))
                    self.scrape_license = open_license

        except Exception as e:
            logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
            self.error += u"Exception doing convert_pdf_to_txt!"
            logger.info(self.error)

        logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(self.url, self.scrape_version, self.scrape_license))
Example #33
def run_fetch():
    while True:
        if market.is_market_open():
            for symbol in symbols:
                symbol = symbol.upper()
                try:
                    fetch_save(symbol)
                except Exception as e:
                    logger.exception(e)
        sleep_to_next_minute()
Example #34
def transmit_book_to_client(rkey = None):
	logger.warning('RKEY:::::::::::::::%s'%rkey)
	cumulative_book = rcon.get(rkey)
	logger.warning('CUMULATIVE BOOK %s'%cumulative_book)
	logger.warning('CUMULATIVE BOOK %s'%type(cumulative_book))
	try:
		buy_side, sell_side = json.loads(rcon.get(rkey))
		socketio.emit('orderbook update', 
				{'buy_side':buy_side, 'sell_side': sell_side}, 
				namespace='/client')
		logger.debug('Sent orderbook volume to client')
	except (TypeError, ValueError):
		logger.exception('failed to decode or emit orderbook update for %s' % rkey)
Example #35
    def gets_a_pdf(self, link, base_url):

        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a pdf".format(absolute_url))

        start = time()
        try:
            self.r = http_get(absolute_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
                return False

            if self.is_a_pdf_page():
                return True

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in gets_a_pdf"
            logger.info(self.error)
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except Exception as e:
            self.error += u"ERROR: Exception error in gets_a_pdf"
            logger.exception(self.error)

        if DEBUG_SCRAPING:
            logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
                elapsed(start), absolute_url))
        return False
Example #36
	def queue_daemon(self, rv_ttl=500):
		""" 
		The daemon that listens for incoming orders. Must be run in a separate process. 
		All received orders are stored in the database
		"""
		while True:
			logger.debug('Waiting for orders...')
			order_form_data = self.redis.blpop(prefixed(self.uuid))
			order_form_data = loads(order_form_data[1])
			new_order = Order(**order_form_data)
			self.store_order(new_order)
			try:
				response = self.process_order(new_order)
				logger.debug('Finished processing order.')
			except Exception as e:
				logger.exception(e)
				response = e
Example #37
def do_get_response(tx_id):
    try:
        uuid.UUID(tx_id, version=4)
    except ValueError:
        raise InvalidUsageError("tx_id supplied is not a valid UUID", 400)

    result = get_responses(tx_id=tx_id)
    if result:
        try:
            result_dict = object_as_dict(result.items[0])['data']
            response = jsonify(result_dict)
            response.headers['Content-MD5'] = hashlib.md5(response.data).hexdigest()
            return response
        except IndexError:
            logger.exception('Empty items list in result.')
            return jsonify({}), 404
    else:
        return jsonify({}), 404
Example #38
 def safe_get_next_record(self, current_record):
     self.error = None
     try:
         next_record = current_record.next()
     except (requests.exceptions.HTTPError, requests.exceptions.SSLError):
         logger.info(u"requests exception!  skipping")
         self.error = u"requests error in safe_get_next_record; try again"
         return None
     except (KeyboardInterrupt, SystemExit):
         # done
         return None
     except StopIteration:
         logger.info(u"stop iteration! stopping")
         return None
     except Exception as e:
         logger.exception(u"misc exception!  skipping")
         self.error = u"error in safe_get_next_record"
         return None
     return next_record
Example #39
    def get_pmh_input_record(self, first, last, use_date_default_format=True):
        args = {}
        args['metadataPrefix'] = 'oai_dc'
        pmh_records = []
        self.error = None

        my_sickle = self.get_my_sickle(self.pmh_url)
        logger.info(u"connected to sickle with {}".format(self.pmh_url))

        args['from'] = first.isoformat()[0:10]
        if use_date_default_format:
            args['from'] += "T00:00:00Z"

        if last:
            args["until"] = last.isoformat()[0:10]
            if use_date_default_format:
                args['until'] += "T00:00:00Z"

        if self.pmh_set:
            args["set"] = self.pmh_set

        logger.info(u"calling ListRecords with {} {}".format(self.pmh_url, args))
        try:
            pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
            # logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
            pmh_input_record = self.safe_get_next_record(pmh_records)
        except NoRecordsMatch as e:
            logger.info(u"no records with {} {}".format(self.pmh_url, args))
            pmh_input_record = None
        except Exception as e:
            if use_date_default_format:
                return self.get_pmh_input_record(first, last, use_date_default_format=False)

            logger.exception(u"error with {} {}".format(self.pmh_url, args))
            pmh_input_record = None
            self.error = u"error in get_pmh_input_record: {} {}".format(
                e.__class__.__name__, unicode(e.message).encode("utf-8"))
            if my_sickle:
                self.error += u" calling {}".format(my_sickle.get_http_response_url())

        return (pmh_input_record, pmh_records, self.error)
Example #40
def get_data_file(filename):
    if logger:
        logger.debug("get_data_file Started.")

    results = {"status": {"http_code": 404}, "contents": {}}
    ret_code = 404

    try:
        with open(filename, "r") as data_file:
            # results['status']['http_code'] = 200
            # results['contents'] = simplejson.load(data_file)
            results = data_file.read()
            ret_code = 200

    except Exception as e:  # IOError is a subclass of Exception
        if logger:
            logger.exception(e)

    if logger:
        logger.debug("get_data_file Finished.")

    return results, ret_code
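A short usage sketch; the filename is a placeholder. Note that on success the function returns the raw file text rather than the dict it starts with:

contents, ret_code = get_data_file("data/sample.json")
if ret_code == 200:
    print(contents)  # raw file text
else:
    print("not found or unreadable:", contents)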
Example #41
 def appdispatch(self, environ, start_response):
     local.request = Request(environ)
     local.response = Response()
     local.session = Session(local.request.cookies.get("session"), 600)
     try:
         local.url_adapter = url_adapter = url_map.bind_to_environ(environ)
         try:
             endpoint, params = url_adapter.match()
         except NotFound:
             endpoint = "notfound"
             params = {}
         local.endpoint = endpoint
         endpoints[endpoint](**params)
     except:
         if self.debug:
             raise
         else:
             logger.exception("Exception")
             endpoints["error"]()
     response = local.response
     local.session.save()
     local.session.set_cookie(local.response)
         
     return response(environ, start_response)
Example #42
def corpus_raw():
    """Collects raw corpus data."""

    raw, source_lang, target_lang = \
        map(lambda x: request.form[x], ('raw', 'sl', 'tl'))

    try:
        # See if 'raw' is a valid JavaScript string
        parsed = parse_javascript(raw)

        # Then insert it to the database
        CorpusRaw.insert(
            hash=hashlib.sha1(raw.encode('utf-8')).hexdigest(),
            raw=json.dumps(parsed),
            source_lang=source_lang,
            target_lang=target_lang,
        )
    except Exception as e:
        logger.exception(e)
        db.session.rollback()

        return str(e), 500

    return ''
Example #43
def translate_v1_0():
    """
    :param sl: source language
    :type sl: string
    :param tl: target language
    :type tl: string
    :param m: mode ( 1 for normal, 2 for better )
    :type m: int
    :param t: text to be translated
    :type t: string

    Translates given text.

    **Example Request**:

    .. sourcecode:: http

        POST /v1.0/translate HTTP/1.1
        User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.99 Safari/537.22
        Host: 192.168.0.185:5000
        Accept: */*
        Content-Length: 57
        Content-Type: application/x-www-form-urlencoded

        sl=ko&tl=en&m=2&t=여러분이 몰랐던 구글 번역기

    **Example Response**

    .. sourcecode:: http

        HTTP/1.0 200 OK
        Content-Type: application/json
        Content-Length: 90
        Server: Werkzeug/0.8.3 Python/2.7.3
        Date: Wed, 10 Apr 2013 06:43:13 GMT

        {
          "translated_text": "Google translation that you did not know",
          "serial_b62": "0z19x",
          "intermediate_text": "\u7686\u3055\u3093\u304c\u77e5\u3089\u306a\u304b\u3063\u305fGoogle\u306e\u7ffb\u8a33"
        }

    **Example iOS Code using ILHTTPClient**

    ILHTTPClient: https://github.com/isaaclimdc/ILHTTPClient

    .. sourcecode:: objective-c

        ILHTTPClient *client = [ILHTTPClient clientWithBaseURL:@"http://translator.suminb.com/" showingHUDInView:self.view];
            NSDictionary *params = @{
                                        @"sl": @"en",
                                        @"tl": @"ko",
                                        @"m": @"2",
                                        @"t": @"Google translation that you did not know."
            };

            [client postPath:@"/v1.0/translate"
                  parameters:params
                 loadingText:@"Loading..."
                 successText:@"Success!"
               multiPartForm:^(id<AFMultipartFormData> formData) {
               }
                     success:^(AFHTTPRequestOperation *operation, NSString *response) {
                         NSLog(@"%@", response);
                     }
                     failure:^(AFHTTPRequestOperation *operation, NSError *error) {
                     }
            ];
    """  # noqa
    keys = ('t', 'm', 'sl', 'tl')
    text, mode, source, target = map(lambda k: request.form[k].strip(), keys)

    try:
        return jsonify(translate(text, mode, source, target))

    except HTTPException as e:
        return e.message, e.status_code

    except Exception as e:
        logger.exception(e)
        return str(e), 500
Example #44
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
                u"ncbi.nlm.nih.gov",
                u"europepmc.org",
                u"/europepmc/",
                u"pubmed",
                u"elar.rsvpu.ru",  #these ones based on complaint in email
                u"elib.uraic.ru",
                u"elar.usfeu.ru",
                u"elar.urfu.ru",
                u"elar.uspu.ru"]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
                return

        try:
            self.r = http_get(url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}.  continuing more checks".format(url))

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url), "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(u"found no PDF download link.  end of the line. [{}]".format(url))

        return self
Example #45
    def scrape_for_fulltext_link(self):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)
            resolved_landing_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.r.url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"landing page is not a PDF for {}.  continuing more checks".format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                for script in soup('script'):
                    script.extract()
                page = str(soup)
            except HTMLParseError as e:
                logger.error(u'error parsing html, skipped script removal: {}'.format(e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via free pdf)"

            # now look and see if it is not just free, but open!
            says_open_url_snippet_patterns = [
                ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
            ]

            for (url_snippet, pattern) in says_open_url_snippet_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if url_snippet in resolved_landing_url.lower() and matches:
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            says_open_access_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
            ]
            for (publisher, pattern) in says_open_access_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
                if self.is_same_publisher(publisher) and matches:
                    self.scraped_license = "implied-oa"
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"

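            # finally, look for an explicit license statement in the page text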
            license_patterns = [
                ur"(creativecommons.org/licenses/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]

            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_license = find_normalized_license(matches[0])
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via page says license)"

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            # RequestException is the base class of the handlers above, so it
            # must come after them or they would never be reached
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
Example #51
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
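    # retry transient 5xx responses up to 10 times, with exponential backoff,
    # before giving up on a request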
    retries = Retry(total=10,
                backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue
        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None
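        # page through this agency's records, `limit` at a time, until the
        # API's reported total_results is exhausted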
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for up to 6 minutes
            except Exception as e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

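                # only insert DOIs that aren't already stored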
                ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)
Example #52
                if tst_date_obj >= start_date_obj:
                    resultList = advisoryList[ndx:]
                    break
        else:
            resultList = stationJson["properties"]["test"]["beachadvisories"][-1]

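        # build a trimmed copy of the station feature carrying only the
        # selected advisories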
        properties = {}
        properties["desc"] = stationJson["properties"]["desc"]
        properties["station"] = stationJson["properties"]["station"]
        properties["test"] = {"beachadvisories": resultList}

        feature = geojson.Feature(id=station, geometry=stationJson["geometry"], properties=properties)
    # IOError, ValueError, and everything else were handled identically,
    # so a single Exception handler is enough
    except Exception as e:
        if logger:
            logger.exception(e)
    try:
        if feature is None:
            feature = geojson.Feature(id=station)

        json_data = {"status": {"http_code": 202}, "contents": feature}
    except Exception as e:
        if logger:
            logger.exception(e)