Exemplo n.º 1
0
def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    This operation should clear DB if run multiple times on the same DB:
    all existing ``Image`` and ``Caption`` rows are deleted before the page
    behind ``BASE_URL_DATASET`` is downloaded content is stored again.

    Grading notes kept from the exercise template:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct DB entry handling,
              2 for correct regex, 2 for Transaction explanation
    -0.5 for small mistakes (return value...)

    NOTE(review): the xpath expressions below assume the page layout implied
    by the exercise (one outer table row per picture, an ``<img>`` in a cell
    and the captions in a nested table) -- verify against the real dataset.

    :return: tuple of a JSON status string and the HTTP status code 200
    :raises requests.exceptions.RequestException: if the download fails, times
        out, or the server answers with an HTTP error status
    """
    # Function-scope imports so this block does not depend on which names the
    # surrounding module happens to import (lxml is stated to be installed).
    import re

    from lxml import html

    # Fetch the dataset page. The timeout prevents the request from blocking
    # forever; raise_for_status() turns 4xx/5xx answers into exceptions so an
    # error page is never parsed as if it were the dataset.
    answer = requests.get(BASE_URL_DATASET, timeout=30)
    answer.raise_for_status()

    # Everything inside the with-block runs in one database transaction:
    # it is committed when the block ends normally and rolled back when an
    # exception escapes (assuming database_holder.database is a peewee
    # database -- confirm). This matters because the tables are emptied first:
    # if parsing or saving fails half-way, the rollback restores the previous
    # content instead of leaving an empty or partially filled database.
    with database_holder.database.transaction():
        # Empty databases. Captions go first because they reference images
        # (presumably through a foreign key -- see Caption(image=...) below).
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        tree = html.fromstring(answer.text)

        # One outer table row per picture.
        for picture_row in tree.xpath('/html/body/table/tr'):
            # The image location is the src attribute of the row's <img>.
            sources = picture_row.xpath('td/img/@src')
            if not sources:
                continue  # row without an image, nothing to store
            src = sources[0]

            # The category is the leading path segment of src ("<category>/...").
            category_match = re.match(r'(\w+)/', src)
            if category_match is None:
                continue  # src does not follow the "<category>/..." scheme
            category = category_match.group(1)

            # save Image in DB, nothing magical here
            image_db = Image(src=src, category=category)
            image_db.save()

            # Captions live in the cells of the table nested inside the row.
            for caption_cell in picture_row.xpath('td//td'):
                caption_text = (caption_cell.text or '').strip()
                if caption_text:
                    Caption(text=caption_text, image=image_db).save()

    return json.dumps({'status': 'finished'}), 200
Exemplo n.º 2
0
def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    This operation should clear DB if run multiple times on the same DB:
    all existing ``Image`` and ``Caption`` rows are deleted before the page
    behind ``BASE_URL_DATASET`` is parsed and stored again.

    Grading notes kept from the exercise:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct DB handling,
              1 for correct regex, 1 for Transaction explanation
    -0.5 for small mistakes (return value...)

    :return: tuple of a JSON status string and the HTTP status code 200
    :raises requests.exceptions.RequestException: if the download fails, times
        out, or the server answers with an HTTP error status
    """

    # The timeout keeps a stalled server from blocking this call forever.
    answer = requests.get(BASE_URL_DATASET, timeout=30)
    # Error handling: turn 4xx/5xx answers into an exception instead of
    # parsing an error page as if it were the dataset.
    answer.raise_for_status()

    # This line starts a new transaction and automatically commits it at the end of the with-clause
    # It is needed because database operations can fail. Then, the transaction would have to be aborted.
    # The with-clause also takes care of this and issues a rollback, so a failure after the
    # delete statements below does not leave the tables empty or half filled.
    with database_holder.database.transaction():
        # Empty databases. Captions go first because they reference images
        # (presumably through a foreign key -- see Caption(image=...) below).
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        tree = html.fromstring(answer.text)

        # for every picture (corresponds to tr)
        for picture_row in tree.xpath('/html/body/table/tr'):
            # get source; a row without an <img> carries no picture
            sources = picture_row.xpath('td/img/@src')
            if not sources:
                continue
            src = sources[0]

            # get category: the leading "<word>/" segment of src
            category_match = re.match(r'(\w+)\/', src)
            if category_match is None:
                continue  # src does not start with "<category>/"
            category = category_match.group(1)

            # save Image in DB, nothing magical here
            image_db = Image(src=src, category=category)
            image_db.save()

            #  get all captions and save them
            for caption_cell in picture_row.xpath('td//td'):
                raw_text = caption_cell.text
                if raw_text is None:
                    continue  # cell has no leading text, so there is no caption
                # Drop the first character -- presumably a leading separator
                # in the dataset markup; verify against the page source.
                Caption(text=raw_text[1:], image=image_db).save()

    return json.dumps({'status': 'finished'}), 200
def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    This operation should clear DB if run multiple times on the same DB:
    all existing ``Image`` and ``Caption`` rows are deleted before the page
    behind ``BASE_URL_DATASET`` is parsed and stored again.

    Grading notes kept from the exercise:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 1 for correct DB cleaning, 1 for correct DB saving,
              2 for correct regex, 2 for Transaction explanation
    -0.5 for small mistakes (return value...)

    :return: json if successful or not -- ``({'status': 'finished'}, 200)`` on
        success, ``({'status': 'failed', 'error': <message>}, 502)`` when the
        dataset page could not be downloaded (both as JSON strings)
    """

    # Download the dataset page. Every failure path returns an error response:
    # falling through would leave `page` unbound, and exiting the process
    # would take the whole service down with it.
    try:
        # The timeout keeps a stalled server from blocking this call forever.
        page = requests.get(BASE_URL_DATASET, timeout=30)
        # Treat 4xx/5xx answers as failures (HTTPError is a RequestException).
        page.raise_for_status()
    except requests.exceptions.Timeout:
        error = 'A timeout occurred.'
    except requests.exceptions.TooManyRedirects:
        error = 'Too many redirects were made.'
    except requests.exceptions.RequestException as exc:
        error = 'An error occurred: {}'.format(exc)
    else:
        error = None

    if error is not None:
        print(error)
        # 502: this service could not get a valid answer from the upstream dataset.
        return json.dumps({'status': 'failed', 'error': error}), 502

    # The with-block runs all statements inside it as one database
    # transaction: it is committed when the block ends normally and rolled
    # back when an exception escapes (assuming database_holder.database is a
    # peewee database -- confirm). This is needed because the tables are
    # emptied first: if parsing or saving fails half-way, the rollback
    # restores the previous content instead of leaving an empty or partially
    # filled database.
    with database_holder.database.transaction():
        # Empty databases. Captions go first because they reference images
        # (presumably through a foreign key -- see Caption(image=...) below).
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        tree = html.fromstring(page.text)

        # One outer table row per picture. Example of a caption cell location,
        # taken from the original author's note:
        # /html/body/table/tr[1000]/td[2]/table/tr[5]/td
        for picture_row in tree.xpath('/html/body/table/tr'):
            # Extract the source attribute of the image in the first cell
            src = next(iter(picture_row.xpath('td[1]/img/@src')), None)
            if src is None:
                continue  # skip entry if no image is in row

            # Take only the substring with the category descriptor: the match
            # is greedy, so group 1 is everything before the last '/' in src.
            category_match = re.match(r'^(\w.*)/', src)
            if category_match is None:
                continue  # skip entry if the category can't be extracted
            category = category_match.group(1)

            # Save Image in DB, nothing magical here
            image_db = Image(src=src, category=category)
            image_db.save()

            # Save the captions additionally: every text node of the cells in
            # the table nested in the second cell
            for caption_node in picture_row.xpath('td[2]/table/*/td/text()'):
                # Remove whitespaces on edges
                caption_text = caption_node.strip()
                Caption(text=caption_text, image=image_db).save()

    return json.dumps({'status': 'finished'}), 200