def pdfjob():
    """
    Function called by APScheduler BackgroundScheduler to kick off PDF parsing script.
    Uses try/except block to call function. This is done because quit() is called within the script as flow control
    to end it early, if this exception is not caught then an exception is raised, while the script will continue to
    run on schedule, it will pollute logs with tracebacks.

    Returns
    -------
    Nothing
    """

    application.logger.debug("PDF job issued, downloading and parsing PDF")
    try:
        parsePDF()
        application.logger.debug("Successfully parsed a new PDF!")
    except SystemExit:
        application.logger.debug(
            "Ended ParsePDF early since the file has already been processed")
        print("Ended ParsePDF job early")
    except Exception as e:
        # print("Parse PDF threw an error, emailing exception")
        application.logger.error("Parse PDF threw an error")
        application.logger.error(e)
        errorEmail.senderroremail(script="ParsePDF",
                                  exceptiontype=e.__class__.__name__,
                                  body=e)
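For context, a minimal sketch of how this job might be registered with APScheduler's BackgroundScheduler; the cron schedule and the surrounding module are assumptions for illustration, not taken from the original project.

# Hypothetical wiring, assuming pdfjob is importable from this module;
# the hourly cron schedule below is chosen purely for illustration.
from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.add_job(pdfjob, "cron", minute=0)
scheduler.start()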
Example #2
def __init__(self, message, status_code=None, payload=None):
    Exception.__init__(self)
    self.message = message
    logger.debug(message)
    if status_code is not None:
        self.status_code = status_code
    self.payload = payload
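This constructor follows the common Flask custom-exception pattern; a minimal, self-contained sketch of the enclosing class and an error handler for it might look like the following (the class name APIError, the default status code, and the handler are assumptions, not part of the original example).

# Hypothetical surrounding class and handler, for illustration only.
import logging

from flask import Flask, jsonify

logger = logging.getLogger(__name__)
app = Flask(__name__)


class APIError(Exception):
    status_code = 400  # assumed default

    def __init__(self, message, status_code=None, payload=None):
        Exception.__init__(self)
        self.message = message
        logger.debug(message)
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload


@app.errorhandler(APIError)
def handle_api_error(error):
    # Turn the exception into a JSON response carrying the stored status code.
    body = dict(error.payload or ())
    body["message"] = error.message
    return jsonify(body), error.status_code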
Example #3
def __init__(self, email, password, first_name, last_name, grade):
    self.email = email
    self.password = generate_password_hash(password)
    self.first_name = first_name
    self.last_name = last_name
    self.grade = grade
    logger.debug('here')
    self.img = 'https://s3-us-west-2.amazonaws.com/libfeed/default-medium.png'
Example #5
def drop_tables(cls):
    models = BaseModel.all_models(cls)
    for model in models:
        table = model._meta.db_table
        try:
            model.drop_table(cascade=True)
            logger.debug('Destroyed table: %s', table)
        except (ProgrammingError, OperationalError):
            logger.warning('Attempted to destroy non-existent table: %s', table)
Example #6
def create_tables(cls):
    models = BaseModel.all_models(cls)
    for model in models:
        table = model._meta.db_table
        try:
            model.create_table()
            logger.debug('Created table: %s', table)
        except ProgrammingError:
            logger.warning('Attempted to create existing table: %s', table)
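As a usage sketch, assuming these are classmethods on a peewee BaseModel that every model in the project inherits from, the pair might be driven like this around a test run (the run_test_suite call is a placeholder, not part of the original code).

# Hypothetical driver code; BaseModel and the connected database come from the project itself.
BaseModel.create_tables()   # create any tables that do not exist yet
run_test_suite()            # placeholder for whatever work needs the tables
BaseModel.drop_tables()     # tear everything down again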
Example #7
def genReSampleDict(tab, pdfDate):
    """
    Generates nested dictionary of re-sampled and fill-in records, with beach names and column names as keys.

    :param tab: Nested list of cleaned table records
    :param pdfDate: String of PDF Date
    :return:
    Nested dictionary containing re-sampled records and filled-in records, with beach names as keys
    """
    application.logger.debug(
        "Generating beach dictionary with resampled and data fill-ins")
    # Pass the resample list back through the cleanText function; for some reason it doesn't appear to be processed
    # properly
    # tab = cleanText(tab)
    resampBeaches = []
    combinedBeaches = []
    resampTab = [tab[0]]
    newRecTab = [tab[0]]
    # Get list of null beaches
    nullBeaches = DBQueriesWaterQuality.getNullBeaches(pdfDate)
    application.logger.debug(f"Null beaches are {nullBeaches}")
    # Iterate over all records in the table
    for row in range(1, len(tab)):
        # Check each beach name, index 0 in the nested list, to see if it contains "sample", meaning it was resampled
        if "sample" in tab[row][0]:
            # print(f"Testing {tab[row][0]}")
            resampRow = tab[row]
            resampRow[0] = resampRow[0].split(' Re')[0].rstrip(" ")
            logger.debug(f"Adding {resampRow[0]} to resample beach list")
            # Add to resample beach list
            resampBeaches.append(resampRow[0])
            # Add to resample table, keeping only the first token of any cell that holds
            # two space-separated values (list.index() is avoided so duplicate cell values
            # cannot overwrite the wrong cell)
            for idx, item in enumerate(resampRow[1:], start=1):
                if item is not None and " " in item:
                    resampRow[idx] = item.split(" ")[0]
            resampTab.append(resampRow)
        elif tab[row][0] in nullBeaches and tab[row][1] is not None:
            # print("This re-sample PDF is also filling in missing data")
            # Add beach name to the combined beaches list
            # application.logger.debug(f"Adding the following beach to the combined beaches list {tab[row][0]}")
            # print(f"Records to be appended are {tab[row]}")
            combinedBeaches.append(tab[row][0])
            # Add table row to the new records list
            newRecTab.append(tab[row])
    # Combine the beach names
    combinedBeaches = resampBeaches + combinedBeaches
    # print(f"Combined beach names list is {combinedbeaches}")
    # Use the beach names to generate a template dictionary
    combinedDict = genDict(combinedBeaches)
    # print(f"Template re-sample dictionary is {combinedDict}")
    # print(f"Re-sample table is {resampTab}")
    # Populate the dictionary with the re-sample data
    combinedDict = populateDict(resampTab, combinedDict, "Yes")
    # Populate the dictionary with the new record data
    combinedDict = populateDict(newRecTab, combinedDict, "No")
    return combinedDict
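For illustration, a made-up example of the row shape this branch logic expects: a re-sampled row carries "Re-sample" in the name cell, and its result cells may hold two space-separated numbers, of which only the first is kept. The beach name and values below are invented.

# Invented beach name and values, for illustration only; real rows come from the parsed PDF table.
row = ["Avalon Beach Re-sample", "23 1600", "10000", "20 40", "400", "2 10", "104", "No", "Open"]
name = row[0].split(' Re')[0].rstrip(" ")                        # -> "Avalon Beach"
cells = [c.split(" ")[0] if " " in c else c for c in row[1:]]    # -> ["23", "10000", "20", ...]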
Example #8
def t_find_pdf_primarily(self, pub_id: str, title: str, authors: list,
                         url: str):
    resd = {"status": "ok"}

    if url:
        files_path = get_config("FILES_PATH")

        file_name = md5(url.encode("utf-8")).hexdigest()

        if not os.path.exists(files_path):
            os.makedirs(files_path)

        pdf_raw = download(url)

        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'

        with open(full_path, "wb+") as f:
            f.write(pdf_raw)

        resd["path"] = full_path

        try:
            content = extract_text_from_pdf(full_path)
        except Exception as e:
            resd["extraction_failure"] = str(e)
            logger.debug(e)
            content = None

        update_result = update_one("publication", {
            "filter": {"id": {"$eq": pub_id}},
            "update": {
                "$set": {
                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                    "content": content
                }
            },
            "upsert": True
        })

        logger.info(f'Update Result: {update_result}')

        t_elasticsearch_indexing.apply_async((pub_id,))
    else:
        authors = find("author", {
            "filter": {"id": {"$in": authors}},
            "projection": {"name": 1}
        })
        t_find_pdf_secondarily.apply_async(
            (pub_id, title, [a["name"] for a in authors])
        )

    return resd
Example #9
def populateDict(tab, beachDict, resample):
    """
    Populates test results dictionary structure with test result values for each beach.

    Parameters
    ----------
    tab: Nested list with cleaned beach results
    beachDict: Dictionary with structure but empty values, will be mutated.
    resample: String re-sample status ("Yes" or "No")

    Returns
    -------
    Mutates and returns input beachDict with beach test results.
    """
    col = [
        'Total Coliform Results (MPN*)',
        'Total Coliform State Health Standard (MPN*)',
        "Fecal Coliform Results (MPN*)",
        'Fecal Coliform State Health Standard (MPN*)',
        'Enterococcus Results (MPN*)',
        'Enterococcus State Health Standard (MPN*)',
        'Exceeds FC:TC ratio standard **', 'Beach Status', 'fk'
    ]

    # Iterate over the table, skipping the first row (the column names), using the row index number
    # print("Inside pop dict func")
    for row in range(1, len(tab)):
        # print(f"Working on row {tab[row]}")
        logger.debug(f"Processing {tab[row]}")
        # For every row in the table, iterate over the columns, ignoring the first column (beach name),
        # since that is the key value. Use the column index to look up the column name in the list above,
        # which supplies the key to set in the second-level dictionary
        for i in range(1, (len(tab[row]))):
            logger.debug(
                f"Filling key {col[i - 1]} with value {tab[row][i]}"
            )
            # col[i-1] is needed because the loop starts at index 1 to skip the beach name in the
            # original list (table), while the column-name list is indexed from 0, so the index is
            # decreased by 1 to line up with the proper column name (key) when filling in data
            # beachDict[tab[row][0]][col[i-1]] = tab[row][i]
            if tab[row][i] is not None:
                beachDict[tab[row][0]][col[i - 1]] = tab[row][i].rstrip(" ")
            else:
                beachDict[tab[row][0]][col[i - 1]] = None
            beachDict[tab[row][0]]['resample'] = resample
    return beachDict
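For illustration, with invented values, a populated entry for a single beach ends up shaped roughly like this, with the column-name list above supplying the keys:

# Invented values, for illustration only; the real data comes from the parsed PDF rows.
beachDict = {
    "Avalon Beach": {
        'Total Coliform Results (MPN*)': "23",
        'Total Coliform State Health Standard (MPN*)': "10000",
        "Fecal Coliform Results (MPN*)": "20",
        'Fecal Coliform State Health Standard (MPN*)': "400",
        'Enterococcus Results (MPN*)': "2",
        'Enterococcus State Health Standard (MPN*)': "104",
        'Exceeds FC:TC ratio standard **': "No",
        'Beach Status': "Open",
        'fk': "1",
        'resample': "Yes",
    }
}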
Example #10
def connect(uri):
    """
    Connects to the database at the given uri.
    """
    global db_proxy

    if uri:
        logger.debug('Connecting to db:%s.' % uri)
        parsed = urlparse(uri)
        db = PostgresqlDatabase(database=parsed.path[1:],
                                user=parsed.username,
                                password=parsed.password,
                                host=parsed.hostname,
                                autorollback=True)
        db.connect()
        db_proxy.initialize(db)
    else:
        logger.error('Could not connect to the database.')
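A minimal usage sketch, assuming db_proxy is a peewee Proxy defined at module level (as the global statement implies) and that the URI comes from the environment; the environment variable name is an assumption.

import os

from peewee import Proxy

db_proxy = Proxy()  # models elsewhere in the project are declared against this proxy

# e.g. DATABASE_URL=postgres://user:password@localhost:5432/mydb
connect(os.environ.get('DATABASE_URL'))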
Example #11
def from_json(cls, json_str):
    dictionary = json.loads(json_str)
    # Grab the list of records stored under the payload's first top-level key
    all_objects = dictionary[next(iter(dictionary))]
    created = []

    for obj in all_objects:
        table = cls._meta.db_table
        logger.debug('Adding a row to %s', table)

        c = cls()
        for key in obj.keys():
            val = obj[key]
            key = underscore(key)
            if key.endswith('date'):
                val = parser.parse(val)
                val = (val - val.utcoffset()).replace(tzinfo=None)
            setattr(c, key, val)
        c.save()
        created.append(c)

    return created
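A hedged usage sketch (the Book model and field names are invented): the method expects a JSON object whose first key maps to a list of records, with camelCase keys that underscore() converts to field names, and any key ending in "date" parsed and normalised to naive UTC.

# Hypothetical model and payload, for illustration only; assumes from_json is
# declared as a classmethod on the model base class shown above.
payload = '{"Books": [{"title": "Example", "publishedDate": "2015-03-01T12:00:00-05:00"}]}'
created = Book.from_json(payload)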
Example #12
def check_function_task(self, file_path, user_id, q_id, function_name, args,
                        answers, timeout):
    """task that will check a submission"""
    # this needs refactoring !
    results = []
    total = len(args)
    status = 'Unsuccessful'
    sub_result = False

    try:
        for i, arg in enumerate(args):
            if q_id in QUESTIONS_WTIH_FILES:
                arg[0] = os.path.join('tests', 'resources', arg[0])
                if q_id in NESTED:
                    results.append(
                        check_functions(file_path,
                                        function_name,
                                        arg,
                                        answers[i],
                                        timeout,
                                        nested=True))
                else:
                    results.append(
                        check_functions(file_path,
                                        function_name,
                                        arg,
                                        answers[i],
                                        timeout,
                                        unbracket=True))

            elif q_id in NO_UNPACK:
                results.append(
                    check_functions(file_path,
                                    function_name,
                                    arg,
                                    answers[i],
                                    timeout,
                                    no_unpack=True,
                                    unbracket=True))

            elif q_id in NESTED:
                results.append(
                    check_functions(file_path,
                                    function_name,
                                    arg,
                                    answers[i],
                                    timeout,
                                    nested=True))

            else:
                results.append(
                    check_functions(file_path,
                                    function_name,
                                    arg,
                                    answers[i],
                                    timeout,
                                    unbracket=True))

            self.update_state(state='PROGRESS',
                              meta={
                                  'current': i,
                                  'total': total,
                                  'status': 'hold on m8!'
                              })

        if all([x['result'] for x in results]):
            status = 'Successful!'
            sub_result = True

    except SoftTimeLimitExceeded:
        send_slack_message('SoftTimeLimitExceeded\n Submission Details:\n')
        send_function_failure(file_path, user_id, q_id, function_name, args,
                              answers)
        logger.debug('Soft time exceeded')
        logger.debug(
            json.dumps({
                'question_name': function_name,
                'current': i,
                'q_id': q_id,
                'total': total,
                'status': status,
                'result': results
            }))
        s3_send(file_path, os.path.basename(file_path))

    con, meta = connect()
    con.execute(meta.tables['results'].insert().values(
        user=int(user_id),
        question=q_id,
        submission_result=sub_result,
        created_date=datetime.now()))
    con.dispose()

    return {
        'question_name': function_name,
        'current': i,
        'q_id': q_id,
        'total': total,
        'status': status,
        'result': results
    }
Example #13
import base64
Example #14
def t_find_pdf_secondarily(self, pub_id: str, title: str, authors: list):
    resd = {"status": "ok"}

    try:
        # Loop over the authors so each one can be checked in turn
        for single_author in authors:
            # Send a search request for this author
            http = urllib3.PoolManager()
            response = http.request(
                'GET', 'https://libgen.is/scimag/?q=' + single_author)
            html_text = response.data
            soup = BeautifulSoup(html_text, 'html.parser')

            # Check whether the search returned any data
            try:
                total_value = str(
                    soup.find('div', attrs={
                        'style': 'float:left'
                    }).getText()).split(" ")[0]
            except Exception:
                total_value = 0
            # If the search returned no data, continue so the loop moves on
            # to the next author
            if total_value == 0:
                continue

            # Calculate the number of pages. Integer division drops any remainder, so
            # check and, if needed, add one to the total page count so that the last
            # page is not missed.
            total_page_double = int(total_value) / 25
            total_page = int(int(total_value) / 25)
            if total_page != total_page_double:
                total_page += 1

            # Walk through the pages to scan this author's results. The request for the
            # first page was already made above (we only get here if the result count is
            # non-zero), so no request is made for the first page here. If nothing is
            # found on the first page and there is more than one page, a new request is
            # made at the end of the loop and the next iteration searches within that
            # new page.
            for i in range(total_page):
                counter = 0
                for row in soup.find_all('tr'):
                    if counter == 0:  # skip the first row, since it holds the table header for this page
                        counter += 1
                        continue
                    row_item = row.find_all('td')
                    row_title = row_item[1].find_all('a')[0].text
                    # Check whether the row title and the given title are similar
                    ratio = fuzz.ratio(row_title.lower(), title.lower())

                    if ratio > 75:
                        url_for_get = row_item[4].find_all('li')
                        href = url_for_get[1].find_all('a',
                                                       href=True)[0]['href']
                        response_for_pdf = http.request('GET', href)
                        pdf_page = BeautifulSoup(response_for_pdf.data,
                                                 'html.parser')
                        pdf_url = pdf_page.find_all(
                            'td', {'align': 'center'})[0].find_all(
                                'a', href=True)[0]['href']

                        pdf_raw = download(pdf_url)

                        files_path = get_config("FILES_PATH")

                        if not os.path.exists(files_path):
                            os.makedirs(files_path)

                        file_name = md5(pdf_url.encode("utf-8")).hexdigest()

                        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'

                        with open(full_path, "wb+") as f:
                            f.write(pdf_raw)

                        resd["path"] = full_path

                        try:
                            content = extract_text_from_pdf(full_path)
                        except Exception as e:
                            resd["extraction_failure"] = str(e)
                            logger.debug(e)
                            content = None

                        update_one(
                            "publication", {
                                "filter": {
                                    "id": {
                                        "$eq": pub_id
                                    }
                                },
                                "update": {
                                    "$set": {
                                        "raw_base64":
                                        base64.encodebytes(pdf_raw).decode(
                                            "utf-8"),
                                        "content":
                                        content
                                    }
                                },
                                "upsert": True
                            })

                        if content:
                            logger.info(f'Content is added to publication.')

                            t_elasticsearch_indexing.apply_async((pub_id, ))

                            return resd

                if total_page > 1:
                    response = http.request(
                        'GET', 'https://libgen.is/scimag/?q=' + single_author +
                        '&page=' + str(i + 2))
                    html_text = response.data
                    soup = BeautifulSoup(html_text, 'html.parser')
    except Exception as e:
        logger.exception(e)

    t_elasticsearch_indexing.apply_async((pub_id, ))

    return resd
Example #15
def testing_get_2(username="******"):
    logger.debug("the username variable is = " + username)
    return "you made a GET request with a username = " + username