# Example no. 1
def test_getPage(data):
    """
    Asserting the creator's URL request with the Page() class and updating its
    value as a parameter

    Args:
        data: fixture data to test Page()
    """

    # build a test page from the pytest fixture configuration
    expected_url = pytest.url
    expected_dialect = pytest.dialect
    page = Page(expected_url, dialect=expected_dialect)

    # URL, dialect and request must match the creator parameters
    assert page.url == expected_url
    assert page.dialect == expected_dialect
    assert page.request is None

    # fetching the page with the URL set in the creator
    status = page.getPage()
    assert page.url == expected_url
    assert status == 200

    # fetching again while overriding the URL as a parameter
    status = page.getPage(pytest.altUrl)
    assert page.url == pytest.altUrl
    assert status == 200
# Example no. 2
    def __init__(self, *args, **kwargs):
        """
        creator of the class gallery()

        Args:
            webg_path (str): URL for the gallery to scrap data
            localg_path (str): local dirpath for the gallery data
            imgd_path (str): local dirpath for the gallery images
            schema (list): array with the column names for the model
            data_frame (data_frame, optional): panda df with data (ie.: paints)
            in the gallery, you can pass an existing df, Default is empty
            wpage (Page): the current webpage the controller is scrapping

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            Model (Model): return a new Model() object
        """
        try:

            # default creator attributes
            self.webg_path = str()
            self.localg_path = str()
            self.imgd_path = str()
            self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
            self.data_frame = pd.DataFrame(columns=DEFAULT_FRAME_SCHEMA)
            self.wpage = Page()

            # positional arguments, mapped strictly by position
            # FIX: the original used args.index(arg), which returns the
            # FIRST occurrence of a value — two equal arguments would both
            # be assigned to the first slot; enumerate() is position-safe
            for i, arg in enumerate(args):

                # URL of the remote gallery to scrap
                if i == 0:
                    self.webg_path = arg

                # local dirpath to save the gallery CSV
                elif i == 1:
                    self.localg_path = arg

                # local dirpath to save the images
                elif i == 2:
                    self.imgd_path = arg

                # dataframe containing the data of the gallery
                elif i == 3:
                    self.data_frame = arg

            # keyword arguments: updating schema (and its empty dataframe)
            if "schema" in kwargs:
                self.schema = copy.deepcopy(kwargs["schema"])
                self.data_frame = pd.DataFrame(columns=self.schema)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: __init__")
# Example no. 3
    def scrapidx(self, gurl, stime, div, attrs):
        """
        Scrap the gallery index and recover all the elements in it

        Args:
            gurl (str): gallery URL to scrap data
            stime (float): waiting time between requests
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to
            refine the search and scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): div and attrs filtered beatifulsoup object
        """
        try:
            # start from a fresh working web page
            self.wpage = Page()

            # download the gallery index and filter it by div/attrs
            self.wpage.get_collection(gurl, stime)
            return self.wpage.findin(div, attributes=attrs)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: scrapidx")
# Example no. 4
def test_findInBody(data, soup):
    """
    Asserts the soup creation with beautifulsoup library in the Page() class

    Args:
        data: fixture data to test Page()
        soup: fixture soup element dictionary to test Page()
    """

    # page under test, configured from the pytest fixtures
    page = Page(pytest.url, dialect=pytest.dialect)
    div = pytest.division
    attrs = pytest.attributes

    # request the page and parse it with the known dialect
    page.getPage()
    page.setSoup()

    # single-element search with the fixture attributes
    single = page.findInBody(div, attributes=attrs, multiple=False)
    assert single != -1

    # multi-element search without attribute filtering
    many = page.findInBody(div, attributes={}, multiple=True)
    assert many != -1
    assert len(many) > 0
# Example no. 5
    def __init__(self, *args, **kwargs):
        """
        Controller() class creator

        Args:
            webg_path (str): URL for the gallery to scrap data
            localg_path (str): local dirpath for the gallery data
            imgd_path (str): local dirpath for the gallery images
            schema (list): array with the column names for the model
            gallery (Gallery): object with the gallery dataframe model
            wpage (Page): the current webpage the controller is scrapping

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            Controller (Model): return a new Controller() object
        """

        try:
            # Controller default values
            self.webg_path = str()
            self.localg_path = str()
            self.imgd_path = str()
            self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
            self.gallery = Gallery()
            self.wpage = Page()

            # positional arguments, mapped strictly by position
            # FIX: enumerate() replaces the original dead `i = 0` binding
            # and the redundant `range(int(len(args)))` index loop
            for i, arg in enumerate(args):

                # URL of the remote gallery to scrap
                if i == 0:
                    self.webg_path = arg

                # local dirpath to save the gallery CSV
                elif i == 1:
                    self.localg_path = arg

                # local dirpath to save the images
                elif i == 2:
                    self.imgd_path = arg

            # keyword arguments

            # updating schema in the controller
            if "schema" in kwargs:
                self.schema = copy.deepcopy(kwargs["schema"])

            # updating the gallery model in the controller
            if "model" in kwargs:
                self.gallery = kwargs["model"]

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: __init__")
# Example no. 6
def test_setSoup(data):
    """
    Asserts the soup creation with beautifulsoup library in the Page() class

    Args:
        data: fixture data to test Page()
    """

    # page under test, configured from the pytest fixtures
    dialect = pytest.dialect
    page = Page(pytest.url, dialect=dialect)

    # after the request, no soup has been parsed yet
    page.getPage()
    assert page.request is not None
    assert page.sbody is None
    assert page.shead is None

    # parsing with the dialect set in the creator
    page.setSoup()
    assert page.sbody is not None
    assert page.shead is not None

    # same flow against the alternative URL, no creator dialect
    page = Page(pytest.altUrl)
    page.getPage()
    assert page.request is not None
    assert page.sbody is None
    assert page.shead is None

    # parsing while passing the dialect explicitly
    page.setSoup(dialect=dialect)
    assert page.sbody is not None
    assert page.shead is not None
# Example no. 7
    def get_imgfn(self, eurl, div, attrs):
        """
        scrap a header value (presumably the image file name — confirm with
        callers) from a gallery element's URL response headers

        Args:
            eurl (str): gallery's element url
            div (str): header keyword whose value is recovered
            attrs (dict): header entries that must all be present in the
            response headers to accept the answer

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (str): header value for `div`, empty string if not found
        """
        try:

            # reset working web page
            self.wpage = Page()

            # get the headers, then the content; only the content status
            # is checked (the original also discarded the header status)
            self.wpage.get_header(eurl)
            rstatus = self.wpage.get_content()

            ans = str()

            if rstatus == 200:
                # accept only when all expected attrs are in the headers
                if attrs.items() <= self.wpage.shead.items():
                    headers = self.wpage.shead
                    ans = str(headers.get(div))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            # FIX: the original reported the placeholder "Gallery: XXXXX",
            # hiding the real error location from the reraised message
            Err.reraise(exp, "Gallery: get_imgfn")
# Example no. 8
def test_newPage(data):
    """
    test for the __init__/creator of a new page object.

    Args:
        data: fixture data to test page()
    """

    def check_untouched(page):
        # request and soups must remain unset right after creation
        assert page.request is None
        assert page.shead is None
        assert page.sbody is None

    # creator without parameters: empty URL and default dialect
    page = Page()
    assert page.url == ""
    assert page.dialect == pytest.dialect
    check_untouched(page)

    # creator with only the URL: the default dialect is kept
    page = Page(pytest.url)
    assert page.url == pytest.url
    assert page.dialect == pytest.dialect
    check_untouched(page)

    # creator with URL and a custom (non-default) dialect string
    page = Page(pytest.url, dialect="pytest.dialect")
    assert page.url == pytest.url
    assert page.dialect != pytest.dialect
    check_untouched(page)
# Example no. 9
    def scrape(self, eurl, div, attrs, **kwargs):
        """
        scrap elements within a link based on the <div>, html marks
        and other attributes or decoratos

        Args:
            eurl (str): gallery's element url
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): HTML divs as a beatifulsoup object
        """
        try:
            # fresh working page for this element
            self.wpage = Page()
            ans = None

            # only parse when the body request succeeded
            if self.wpage.get_body(eurl) == 200:
                # find element inside the html body
                ans = self.wpage.findin(div,
                                        attributes=attrs,
                                        multiple=kwargs.get("multiple"))

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: scrape")
# Example no. 10
class Gallery():
    """
    this class implement the gallery of the model, containing all its
    elements (ie.: painintgs) contains all gallery data in memory and
    helps create the data_frame for it.
    """

    # =========================================
    # class variables
    # =========================================
    # NOTE(review): these are CLASS-level defaults shared by every instance
    # until __init__ rebinds them; the mutable ones (data_frame, wpage)
    # would be shared mutable state if __init__ were ever bypassed
    webg_path = str()  # URL of the remote gallery to scrap
    localg_path = str()  # local dirpath for the gallery CSV data
    imgd_path = str()  # local dirpath for the gallery images
    schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)  # model column names
    data_frame = pd.DataFrame(columns=DEFAULT_FRAME_SCHEMA)  # in-memory data
    wpage = Page()  # working web page used while scrapping

    # =========================================
    # functions to create a new gallery
    # =========================================
    def __init__(self, *args, **kwargs):
        """
        creator of the class gallery()

        Args:
            webg_path (str): URL for the gallery to scrap data
            localg_path (str): local dirpath for the gallery data
            imgd_path (str): local dirpath for the gallery images
            schema (list): array with the column names for the model
            data_frame (data_frame, optional): panda df with data (ie.: paints)
            in the gallery, you can pass an existing df, Default is empty
            wpage (Page): the current webpage the controller is scrapping

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            Model (Model): return a new Model() object
        """
        try:

            # default creator attributes
            self.webg_path = str()
            self.localg_path = str()
            self.imgd_path = str()
            self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
            self.data_frame = pd.DataFrame(columns=DEFAULT_FRAME_SCHEMA)
            self.wpage = Page()

            # positional arguments, mapped strictly by position
            # FIX: the original used args.index(arg), which returns the
            # FIRST occurrence of a value — two equal arguments would both
            # be assigned to the first slot; enumerate() is position-safe
            for i, arg in enumerate(args):

                # URL of the remote gallery to scrap
                if i == 0:
                    self.webg_path = arg

                # local dirpath to save the gallery CSV
                elif i == 1:
                    self.localg_path = arg

                # local dirpath to save the images
                elif i == 2:
                    self.imgd_path = arg

                # dataframe containing the data of the gallery
                elif i == 3:
                    self.data_frame = arg

            # keyword arguments: updating schema (and its empty dataframe)
            if "schema" in kwargs:
                self.schema = copy.deepcopy(kwargs["schema"])
                self.data_frame = pd.DataFrame(columns=self.schema)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: __init__")

    # =========================================
    # Index functions
    # =========================================

    def scrapidx(self, gurl, stime, div, attrs):
        """
        Scrap the gallery index and recover all the elements in it

        Args:
            gurl (str): gallery URL to scrap data
            stime (float): waiting time between requests
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to
            refine the search and scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): div and attrs filtered beatifulsoup object
        """
        try:
            # start from a fresh working web page
            self.wpage = Page()

            # download the gallery index and filter it by div/attrs
            self.wpage.get_collection(gurl, stime)
            return self.wpage.findin(div, attributes=attrs)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: scrapidx")

    def scrapagn(self, div, attrs):
        """
        Using the scrapidx() results, scrap for new information
        to complete the dataframe index

        Args:
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): div and attrs filtered beatifulsoup object
        """
        try:
            # filter the already-downloaded working page by div/attrs
            return self.wpage.findin(div, attributes=attrs)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: scrapagn")

    def newidx(self, cols, data):
        """
        creates a new dataframe in the model based on the columns
        names and new data.

        Args:
            cols (list): list of column names to create the new dataframe
            data (list:list, pandas/numpy matrix): data for the columns of
            the new dataframe

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bool): true if the function created a new df-frame,
            false otherwise
        """
        try:
            # start over from an empty dataframe with the model schema
            self.data_frame = pd.DataFrame(columns=self.schema)

            # fill each column with its corresponding data; the answer is
            # True only when at least one column was assigned
            ans = False
            for cname, cdata in zip(cols, data):
                self.data_frame[cname] = cdata
                ans = True

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: newidx")

    def get_idxid(self, gsoup, ide, clean):
        # TODO: remove after implement the Topic() class
        """
        get the unique identifier (ID) of the gallery elements (paints) and
        list them to introduce them into the dataframe

        Args:
            gsoup (bs-obj): list with gallery elements in Beatiful Soup format
            ide (str): HTML <div> keyword to extract the element (paint) ID
            clean (str): substring stripped from each recovered ID

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list with the elements (paints) IDs
        """
        try:
            # strip the `clean` substring from each element's `ide` attribute
            return [elem.get(ide).replace(clean, "") for elem in gsoup]

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: get_idxid")

    def get_idxurl(self, gsoup, rurl, urle):
        # TODO: remove after implement the Topic() class
        """
        get the list of the elements inside the gallery index based on the root
        domain url and html div tags

        Args:
            gsoup (bs-obj): beatifulSoup object containing the gallery's
            element list
            rurl (str): root URL of the domain to complete the element url
            urle (str): HTML <div> keyword to process the Page's scraped
            gallery urls

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list with each of the gallery's unique urls
        """
        try:
            # join the root domain with each element's relative url
            return [
                urllib.parse.urljoin(rurl, node.get(urle))
                for node in gsoup
            ]

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: get_idxurl")

    def get_idxtitle(self, gsoup, etitle):
        # TODO: remove after implement the Topic() class
        """
        get the element titles from the gallery main page

        Args:
            gsoup (bs-obj): beatifulSoup object containing the gallery's
            element list
            etitle: HTML <div> keyword to process the scraped data from
            the gallery's soup to get the element titles

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): gallery element (paints) titles in string
        """
        try:
            ans = list()

            for element in gsoup:
                # fall back to "untitled" when the element has no title
                etext = element.get(etitle)
                ans.append("untitled" if etext is None else etext)

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: get_idxtitle")

    # =========================================
    # Scrap columns functions in Index
    # =========================================

    def scrape(self, eurl, div, attrs, **kwargs):
        """
        scrap elements within a link based on the <div>, html marks
        and other attributes or decoratos

        Args:
            eurl (str): gallery's element url
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): HTML divs as a beatifulsoup object
        """
        try:
            # fresh working page for this element
            self.wpage = Page()
            ans = None

            # only parse when the body request succeeded
            if self.wpage.get_body(eurl) == 200:
                # find element inside the html body
                ans = self.wpage.findin(div,
                                        attributes=attrs,
                                        multiple=kwargs.get("multiple"))

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: scrape")

    def get_imgfn(self, eurl, div, attrs):
        """
        scrap a header value (presumably the image file name — confirm with
        callers) from a gallery element's URL response headers

        Args:
            eurl (str): gallery's element url
            div (str): header keyword whose value is recovered
            attrs (dict): header entries that must all be present in the
            response headers to accept the answer

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (str): header value for `div`, empty string if not found
        """
        try:

            # reset working web page
            self.wpage = Page()

            # get the headers, then the content; only the content status
            # is checked (the original also discarded the header status)
            self.wpage.get_header(eurl)
            rstatus = self.wpage.get_content()

            ans = str()

            if rstatus == 200:
                # accept only when all expected attrs are in the headers
                if attrs.items() <= self.wpage.shead.items():
                    headers = self.wpage.shead
                    ans = str(headers.get(div))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            # FIX: the original reported the placeholder "Gallery: XXXXX",
            # hiding the real error location from the reraised message
            Err.reraise(exp, "Gallery: get_imgfn")

    def clean_imgfn(self, text, elem, clean):
        """
        extract a clean file name from a text chunk

        Args:
            text (str): text to be clean
            elem (str): keyword to split the str and process
            clean (str): keyword to clean in the text

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (str): clean file name with extension
        """
        try:
            # take the part after `elem`; index [1] raises IndexError when
            # `elem` is absent, which the handler below reraises
            tail = text.split(elem)[1]
            return tail.strip().strip(clean)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_imgfn")

    def get_imgf(self, gfolder, dlurl, pfn):
        # TODO: remove after implement the Topic() class
        """
        save the paint file from the asset URL in the local folder path

        Args:
            gfolder (str): root local dirpath where the file is going to be
            save
            dlurl (str): url address with the downlodable image file
            pfn (str): filename to save the image

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bool): True if the file exists in the local dirpath after
            the call (downloaded now or already present)
        """
        try:
            # the last URL path segment selects the local subfolder
            # (negative index replaces the original split()[len(...)-1])
            imgf = urllib.parse.urlparse(dlurl).path.split("/")[-1]
            fp = os.path.join(gfolder, imgf, pfn)

            # the file already exists locally: nothing to download
            if os.path.exists(fp):
                return True

            # saving the previously requested content in binary form;
            # the `with` block closes the file — the explicit close() in
            # the original was redundant
            with open(fp, "wb") as file:
                file.write(self.wpage.content)

            return True

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: get_imgf")

    def updata(self, column, data):
        """
        updates a single column with new data, the size of the data needs to be
        the same as the existing records

        Args:
            column (str): name of the column in the dataframe to update
            data (list/np.array): dataframe of the data to update

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bool): True when the column holds a value after assignment
        """
        try:
            # assign the new data and confirm the column is set
            self.data_frame[column] = data
            return self.data_frame[column] is not None

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: updata")

    # =========================================
    # consult functions
    # =========================================

    def getdata(self, column):
        """
        gets the data from a given column name, returning a list

        Args:
            column (str): name of the column in the dataframe to read

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): formated copy of the data in the dataframe
        """
        try:
            # deep copy so callers cannot mutate the model's dataframe
            return list(copy.deepcopy(self.data_frame[column]))

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: getdata")

    def check_gallery(self):
        """
        checks the state of the model's dataframe

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # info() prints the dataframe summary to stdout; nothing
            # is returned to the caller
            self.data_frame.info()

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: check_gallery")

    # =========================================
    # update functions
    # =========================================
    def upindex(self, column, data):
        """
        updates a single column according to its index/name in the dataframe

        Args:
            column (str): column name in the dataframe
            data (list): list with the updated data for the pandas dataframe,
            needs to have the same size of the original

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans: whatever DataFrame.info() returns after the update (it
            prints the summary; its return value is forwarded as-is)
        """
        try:
            # assign the new column data, then forward the df description
            self.data_frame[column] = data
            return self.data_frame.info()

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: upindex")

    # =========================================
    # I/O functions
    # =========================================

    def save_gallery(self, fn, dfolder):
        """
        save the in memory dataframe into a CSV file with UTF-8 encoding

        Args:
            fn (str): file name with .csv extension
            dfolder (file-object): valid dirpath str or array with
            valid folders.

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # to_csv() returns None when writing to a path, so a None
            # result signals a successful write
            gfp = os.path.join(os.getcwd(), dfolder, fn)
            tdata = self.data_frame.to_csv(gfp,
                                           sep=",",
                                           index=False,
                                           encoding="utf-8",
                                           mode="w",
                                           quoting=csv.QUOTE_ALL)
            return tdata is None

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: save_gallery")

    def load_gallery(self, fn, dfolder):
        """
        loads the gallery from a CSV file in UTF-8 encoding

        Args:
            fn (str): file name with .csv extension
            dfolder (file-object): valid dirpath str or array with
            valid folders.

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # read an existing CSV file to replace the in-memory dataframe
            gfp = os.path.join(os.getcwd(), dfolder, fn)
            self.data_frame = pd.read_csv(gfp,
                                          sep=",",
                                          encoding="utf-8",
                                          engine="python",
                                          quoting=csv.QUOTE_ALL)
            # a non-None dataframe signals a successful load
            return self.data_frame is not None

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: load_gallery")

    def export_imgs(self, sfpn, tfpn, tsufix):
        """
        Export images from source files into target files with CV2

        Args:
            sfpn (list): local filepaths of source images
            tfpn (list): local filepaths of target images
            tsufix (dict): target image file sufix, ie.: "-rgb"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): relative filepaths for the target images
        """
        try:
            # default answer: one empty-string entry per tsufix key
            ans = dict()
            wans = dict()
            for key in tsufix.keys():
                wans[key] = str()

            # checking if both list have images
            eq1 = (len(sfpn) > 0) and (len(tfpn) > 0)
            # checking if the target files and the keys equal
            eq2 = (len(tfpn) == len(tsufix.keys()))

            # evaluating both conditions
            if eq1 and eq2:

                # iterating in the source files
                # NOTE(review): every source file is exported against every
                # (target, key) pair — confirm sfpn is expected to have a
                # single entry, otherwise later sources overwrite targets
                for sf in sfpn:

                    # iterating in the target files paths and keys
                    for tf, key in zip(tfpn, tsufix.keys()):
                        # default temporal variables, reset per target
                        complete = False
                        tdf = None

                        # checking if is RGB
                        # if any("rgb" in s for s in (tf, key)):
                        if "rgb" in tf:
                            # opening the source file
                            tdf = cv2.imread(sf, cv2.IMREAD_UNCHANGED)
                            # exporting/saving to RBG file
                            complete = cv2.imwrite(tf, tdf)

                        # checking if is B&W
                        # elif any("bw" in s for s in (tf, key)):
                        elif "bw" in tf:
                            # opening the source file
                            tdf = cv2.imread(sf, cv2.IMREAD_GRAYSCALE)
                            # convert = cv2.COLOR_BGR2GRAY
                            # tdf = cv2.cvtColor(tdf, convert)
                            # exporting/saving to B&W file
                            complete = cv2.imwrite(tf, tdf)

                        # updating answer dict only after a successful write
                        if complete is True:
                            # keep only the last 4 path components
                            # (presumably the gallery-relative path of the
                            # exported image — TODO confirm with callers)
                            tf = os.path.normpath(tf)
                            tf = tf.split(os.sep)
                            tf = tf[len(tf) - 4:len(tf)]
                            tf = os.path.join(*tf)
                            td = {key: tf}
                            wans.update(td)

            # returning a detached copy of the working answer
            ans = copy.deepcopy(wans)
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: export_imgs")

    def export_shapes(self, tfpn, tsufix):
        """
        Recover the shapes of the exported target images with CV2

        Opens each target image (RGB or B&W according to its filename)
        and maps every sufix key to the image's shape list.

        Args:
            tfpn (list): local filepaths of target images
            tsufix (dict): target image file sufix, ie.: "-rgb"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): image shapes (lists of ints) keyed by the
            tsufix keys (empty string when the file was not read)
        """
        try:
            # default answer: one (initially empty) entry per sufix key
            ans = dict()
            wans = dict()
            for key in tsufix.keys():
                wans[key] = str()

            # checking if list have images
            if len(tfpn) > 0:

                # checking if the target files and the keys equal
                if len(tfpn) == len(tsufix.keys()):

                    # iterating ordered keys so files pair with keys
                    # deterministically
                    sort_sufix = sorted(tsufix.keys(), reverse=False)
                    for tf, key in zip(tfpn, sort_sufix):
                        tf = str(tf)
                        # default temporal variables
                        tdf = None
                        complete = False
                        tshape = list()

                        # checking if it is RGB
                        if "rgb" in tf:
                            # opening file in RBG
                            tdf = cv2.imread(tf, cv2.IMREAD_UNCHANGED)
                            # exporting/saving to RBG shape
                            tshape = list(tdf.shape)
                            complete = True

                        # checking if it is B&W; elif keeps this branch
                        # from clobbering the RGB shape when a filepath
                        # contains both markers (consistent with the
                        # branch structure in export_imgs)
                        elif "bw" in tf:
                            # opening file in B&W
                            tdf = cv2.imread(tf, cv2.IMREAD_GRAYSCALE)
                            # exporting/saving to B&W shape
                            tshape = list(tdf.shape)
                            complete = True

                        # updating answer dict
                        if complete is True:
                            td = {key: tshape}
                            wans.update(td)

            # returning answer
            ans = copy.deepcopy(wans)
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: export_shapes")

    def get_srcimgs(self, sfp, sfext):
        """
        Scan a local folder and collect the filepaths of the images
        whose names end with the given file extension

        Args:
            sfp (str): local folderpath of the source images to scan
            sfext (str): source image file extension, ie.: "jpg"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of the source images local filepaths
        """
        try:
            # keep every folder entry that matches the extension,
            # joined back onto the folderpath
            ans = [os.path.join(sfp, fname)
                   for fname in os.listdir(sfp)
                   if fname.endswith(sfext)]

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: get_srcimgs")

    def set_tgtimgs(self, sfpn, tfp, tfext, tsufix):
        """
        Creates the target images filepaths in the localpath from the
        source filenames plus the target sufixes and file extensions

        Args:
            sfpn (list): source local filepaths of images
            tfp (str): target local folderpath to set the images
            tfext (dict): target image file extension, ie.: "jpg"
            tsufix (dict): target image file sufix, ie.: "-rgb"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of the target images local filepaths
        """
        try:
            # default answer
            ans = list()

            # checking if source folder has viable files
            if len(sfpn) > 0:

                # checking source file list
                for sf in sfpn:
                    # recover the source filename without its directory
                    sfn = os.path.basename(sf)
                    # strip ONLY the final extension; os.path.splitext
                    # keeps dotted basenames (ie.: "img.v2.jpg") intact,
                    # unlike split(".")[0] which truncated them
                    sfn = os.path.splitext(sfn)[0]

                    # creating target files with sufix and extension
                    for te, ts in zip(tfext.keys(), tsufix.keys()):
                        # specific target filename + extension
                        tfn = sfn + tsufix.get(ts) + "." + tfext.get(te)
                        ans.append(os.path.join(tfp, tfn))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: set_tgtimgs")

    # =========================================
    # clean scraped information functions
    # =========================================

    def clean_description(self, soup, elem, clean):
        # TODO: remove after implement the Topic() class
        """
        Clean the page's description from the beatifulSoup object

        Builds a dict of description entries: the page title, every
        description paragraph, the description text section, and the
        related links found in the second soup section.

        Args:
            soup (bs-obj): beatifulSoup object with the description data,
            indexed as soup[0] (title/paragraphs) and soup[1] (text and
            links) — NOTE(review): only len(soup) > 0 is checked, so a
            single-section soup raises IndexError at soup[1]; confirm
            callers always pass two sections
            elem (str): HTML <div> keyword to scrap the description data
            clean (list): secondary <div> to clean the description data

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): Element (paint) clean description
        """
        try:
            # get the title in the painting page
            ans = dict()

            # some pages dont follow the most commond diagram
            if soup is not None:

                if len(soup) > 0:

                    # finding title <h1> in the soup
                    value = soup[0].find(elem[0])
                    # cleaning data: the first attr value, minus the
                    # clean[1] prefix, becomes the dict key
                    key = value.attrs.get(clean[0])[0]
                    key = str(key).replace(clean[1], "", 1)
                    key = self.clrtext(key)

                    value = str(value.string).strip()
                    value = self.clrtext(value)

                    # creating the dict to return to save as JSON
                    td = {key: value}
                    # updating answer dict
                    ans.update(copy.deepcopy(td))

                    # finding all description paragraphs <p> in the soup
                    description = soup[0].findAll(elem[1])
                    for element in description:

                        # same attr-derived key / cleaned text value
                        key = element.attrs.get(clean[0])[0]
                        key = str(key)
                        key = key.replace(clean[1], "", 1)
                        key = self.clrtext(key)

                        value = str(element.string).strip()
                        value = self.clrtext(value)

                        # creating the dict to return to save as JSON
                        td = {key: value}

                        # updating answer dict
                        ans.update(copy.deepcopy(td))

                    # getting description text section
                    key = soup[1]
                    key = key.attrs.get(clean[0])[0]
                    key = str(key)
                    key = key.replace(clean[1], "", 1)
                    key = self.clrtext(key)

                    # getting section description text, concatenating
                    # every string fragment inside the first elem[1] tag
                    text = soup[1].find(elem[1])
                    value = str()
                    for txt in text:
                        txt = txt.string
                        txt = str(txt)
                        value = value + txt

                    # cleaning data
                    value = str(value).strip()
                    value = self.clrtext(value)

                    # updating answer dict
                    td = {key: value}
                    ans.update(copy.deepcopy(td))

                    # finding all the related links in the description
                    links = soup[1].findAll(elem[2])
                    for link in links:
                        key = str(link.string)
                        key = self.clrtext(key)

                        # getting the link URL
                        value = link.get(clean[2])
                        # reconstructing all the url from the page
                        value = str(value)
                        td = {key: value}

                        # creating the dict to return to save as JSON
                        # (duplicate of the assignment above, kept as-is)
                        td = {key: value}

                        # updating answer dict
                        ans.update(copy.deepcopy(td))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_description")

    def clean_searchtags(self, rurl, soup, elem, clean):
        # TODO: remove after implement the Topic() class
        """
        Extract and clean the page's search-tags from the beatifulSoup
        object

        Args:
            rurl (str): root URL of the domain to complete the search-tags
            soup (bs-obj): beatifulSoup object with the search-tags data
            elem (str): HTML <div> keyword to scrap the search-tags data
            clean (str): secondary <div> keyword to clean the data from
            the scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): Element (paint) clean search-tags
        """
        try:
            # default answer
            ans = dict()

            # nothing to do without a non-empty soup collection
            if soup is None or len(soup) == 0:
                return ans

            # finding searchtags <a> in the soup
            tags = soup[0].findAll(elem)

            # processing the search tags
            if isinstance(tags, list) and len(tags) > 0:
                for tag in tags:
                    # tag text becomes the key, its full URL the value
                    tname = self.clrtext(str(tag.string))
                    turl = str(urllib.parse.urljoin(rurl, tag.get(clean)))
                    # updating answer dict
                    ans.update(copy.deepcopy({tname: turl}))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_searchtags")

    def clean_objdata(self, soup, elem):
        # TODO: remove after implement the Topic() class
        """
        Extract and clean the page's object-data key/value pairs from
        the beatifulSoup object

        Args:
            soup (bs-obj): beatifulSoup object with the object-data data
            elem (str): HTML <div> keyword to scrap the object-data data

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): Element (paint) clean object-data
        """
        try:
            # default answer
            ans = dict()

            # checking if object-data exists
            if soup is not None:

                # finding <dt> (keys) and <dd> (values) in the soup
                dts = soup.findAll(elem[0])
                dds = soup.findAll(elem[1])

                # soup keys and values must have data
                if len(dts) > 0 and len(dds) > 0:
                    # pairing each <dt> with its <dd> counterpart
                    for dt, dd in zip(dts, dds):
                        # cleaning data for dictionary
                        tkey = self.clrtext(str(dt.string))
                        tvalue = self.clrtext(str(dd.string))
                        # updating answer dict
                        ans.update(copy.deepcopy({tkey: tvalue}))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_objdata")

    def clean_relwork(self, rurl, soup, elem, clean):
        # TODO: remove after implement the Topic() class
        """
        Process the scraped related-work data from the beatifulSoup
        object into a dict of work titles and their full URLs

        Args:
            rurl (str): domain root URL to complete the related-work link
            soup (bs-obj): beatifulSoup object with the related-work data
            elem (str): HTML <div> keyword to scrap the related-work data
            clean (list): secondary <div> to clean the related-work data

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dict): Element (paint) clean related-work
        """
        try:
            # default answer
            ans = dict()

            # checking if related-work soup exists
            if soup is not None:

                # finding related-work <article> entries in the soup
                works = soup[0].findAll(elem)

                # duplicate-title counter starts at 1
                dupn = 1
                for work in works:
                    # cleaning the work title (key) and its URL (value)
                    wname = self.clrtext(str(work.find(clean[0]).string))
                    wurl = work.find(clean[1]).get(clean[2])
                    wurl = str(urllib.parse.urljoin(rurl, wurl))

                    # disambiguate repeated titles with a numeric sufix
                    if wname in ans.keys():
                        wname = wname + " " + str(dupn)
                        dupn += 1

                    # updating answer dict
                    ans.update(copy.deepcopy({wname: wurl}))

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_relwork")

    def clean_dlurl(self, gsoup, rurl, urle):
        # TODO: remove after implement the Topic() class
        """
        Recover the download URL of a gallery element

        Args:
            gsoup (bs-obj): beatifulSoup object with gallery element list
            rurl (str): domain root URL to complete the gallery index
            urle (str): HTML <div> keyword to scrap the gallery index
            urls to download files

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (str): unique URL with the downloadable element's file,
            or None when there is no soup to process
        """
        try:
            # no soup -> no URL to recover
            if gsoup is None:
                return None

            # join the element's relative link with the domain root
            relative = gsoup.get(urle)
            return urllib.parse.urljoin(rurl, relative)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clean_dlurl")

    def clrtext(self, text):
        # TODO: remove after implement the Topic() class
        """
        Normalize HTML-scraped text: strip surrounding whitespace, fold
        accented characters to plain ASCII, collapse extra spaces,
        turn newlines into sentence separators and drop leftovers

        Args:
            text (str): text to clean

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans(str): clean text
        """
        try:
            # cast and strip surrounding whitespace
            clean = str(text).strip()

            # fold accents/diacritics into plain ASCII
            clean = unicodedata.normalize('NFD', clean)
            clean = clean.encode('ascii', 'ignore').decode("utf-8")

            # collapse whitespace runs that start with a space
            clean = re.sub(r" \s+", " ", clean)
            # newlines become sentence separators
            clean = re.sub(r"\n", ". ", clean)
            # drop single quotes entirely
            clean = re.sub(r"'", "", clean)
            # remove "None" artifacts left by missing HTML nodes
            clean = re.sub(r"None{1,3}", " ", clean)

            # second pass collapsing any spaces the subs introduced
            clean = re.sub(r" \s+", " ", str(clean))

            # return answer
            return clean

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Gallery: clrtext")
# Exemplo n.º 11
# 0
class Controller():
    """
    Controller class, communicates the View() and the Model(); it also
    manage file Input/Output

    The controller mediates between the view and the model, there are
    some operations implemented in this class, specially the load and
    save functions as well as functions to merge the results from
    different elements in the models or various models.
    """

    # =========================================
    # class variables
    # =========================================
    # NOTE(review): class-level defaults shared by all instances until
    # __init__ rebinds them per instance (it always does below)
    # URL of the remote web gallery to scrap
    webg_path = str()
    # local dirpath where the gallery data is stored
    localg_path = str()
    # local dirpath where the gallery images are stored
    imgd_path = str()
    # column names for the gallery dataframe model
    schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
    # Gallery() model default
    gallery = Gallery()
    # Page() scrapper default
    wpage = Page()

    # =========================================
    # class creator
    # =========================================

    def __init__(self, *args, **kwargs):
        """
        Controller() class creator

        Args:
            webg_path (str): URL for the gallery to scrap data
            localg_path (str): local dirpath for the gallery data
            imgd_path (str): local dirpath for the gallery images
            schema (list): array with the column names for the model
            model (Gallery): object with the gallery dataframe model

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            Controller (Model): return a new Controller() object
        """

        try:
            # Controller default values
            self.webg_path = str()
            self.localg_path = str()
            self.imgd_path = str()
            self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
            self.gallery = Gallery()
            self.wpage = Page()

            # positional args map, in order, to:
            # 0 -> URL of the remote gallery to scrap
            # 1 -> local dirpath to save the gallery CSV
            # 2 -> local dirpath for the gallery images
            attrs = ("webg_path", "localg_path", "imgd_path")
            for attr, value in zip(attrs, args):
                setattr(self, attr, value)

            # if there are dict decorators in the creator
            if len(kwargs) > 0:

                # updating schema in the controller
                if "schema" in kwargs:
                    self.schema = copy.deepcopy(kwargs["schema"])

                # updating the gallery model in the controller
                if "model" in kwargs:
                    self.gallery = kwargs["model"]

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: __init__")

    # =========================================
    # Config local folder functions
    # =========================================

    def setup_local(self, *args):
        """
        Set up the local gallery folderpath joining the root gallery
        folder with other subfolders, creating the path if missing

        Args:
            rootf (str): name of the main gallery local folder
            subfolders (list, optional): the subfolders names to the gallery
            conforming the absolute dirpath

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            wpath (str): returns the local filepath to the gallery
        """
        try:

            # answer with realpath local subfolders
            wpath = os.path.join(*args)

            # create the folder tree when missing; exist_ok avoids the
            # check-then-create race of exists() + makedirs()
            os.makedirs(wpath, exist_ok=True)

            return wpath

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: setup_local")

    def create_localfolders(self, *args):
        """
        Creates local subfolders with the gallery folder as root for them

        Args:
            gfolder (str): name of the main gallery folder
            coln (str): name of the ID column to create the folders

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:

            gfolder = args[0]
            coln = args[1]

            # looping throught ID list as folder names for the local gallery
            for folder in self.getdata(coln):

                # create the local folder when missing; exist_ok replaces
                # the racy exists()/makedirs() check and the dead
                # "elif exists: pass" branch
                tfp = os.path.join(gfolder, folder)
                os.makedirs(tfp, exist_ok=True)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: create_localfolders")

    # =========================================
    # Index functions
    # =========================================

    def scrapidx(self, gurl, stime, div, attrs):
        """
        Scrap the gallery, create a new index and recover all elements in it

        Args:
            gurl (str): URL for the gallery to scrap data
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine
            the search and scrap
            stime (float): waiting time between requests

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): div and attrs filtered beatifulsoup object
        """
        try:
            # delegate the index scrap to the gallery model
            return self.gallery.scrapidx(gurl, stime, div, attrs)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrapidx")

    def scrapagn(self, div, attrs):
        """
        Scrap for new information and complete the dataframe index after
        executing the scrapidx() function

        Args:
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bs-obj): div and attrs filtered beatifulsoup object
        """
        try:
            # delegate the follow-up scrap to the gallery model
            return self.gallery.scrapagn(div, attrs)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrapagn")

    def get_idxid(self, gsoup, ide, clean):
        """
        get the unique identifier (ID) of the gallery elements (paints) and
        list them to introduce them into the dataframe

        Args:
            gsoup (bs-obj): list with gallery elements in Beatiful Soup format
            ide (str): HTML <div> keyword to extract the element (paint) ID
            clean: cleaning keyword(s) forwarded to the gallery model —
            NOTE(review): exact type depends on Gallery.get_idxid; confirm

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list with the elements (paints) IDs
        """
        try:
            # delegate ID extraction to the gallery model
            gm = self.gallery
            ans = gm.get_idxid(gsoup, ide, clean)
            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: get_idxid")

    def get_idxurl(self, gsoup, rurl, urle):
        """
        Get the list of the elements inside the gallery index based on the root
        domain url and html div tags

        Args:
            gsoup (bs-obj): beatifulSoup object containing the gallery's
            element list
            rurl (str): root URL of the domain to complete the element url
            urle (str): HTML <div> keyword to process the Page's scraped
            gallery urls

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list with each of the gallery's unique urls
        """
        try:
            # delegate URL extraction to the gallery model
            return self.gallery.get_idxurl(gsoup, rurl, urle)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: get_idxurl")

    def get_idxtitle(self, gsoup, etitle):
        """
        Get the element titles from the gallery main page

        Args:
            gsoup (bs-obj): beatifulSoup object containing the gallery's
            element list
            etitle (str): HTML <div> keyword to process the scraped data
            from the gallery's soup to get the element titles

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): gallery element (paints) titles in string
        """
        try:
            # delegate title extraction to the gallery model
            return self.gallery.get_idxtitle(gsoup, etitle)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: get_idxtitle")

    # =========================================
    # Scrap columns functions from Index
    # =========================================

    def scrap_descriptions(self, *args, **kwargs):
        """
        Scrap the elements (paints) description in the index using the
        ID column name, HTML divisions <divs>, decorative attributes,
        secondary HTML elements and cleaning HTML divisions

        Args:
            coln (str): ID column name of the gallery dataframe
            div (str): HTML <div> search and scrap keyword
            attrs (dict): decorative <div> keywords to refine the scrap
            elem (str): secondary <div> keyword to refine the search
            and scrap process
            clean (list): secondary <div> keywords to clean the data
            obtained from the scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of element descriptions in JSON format
        """
        try:
            # unpacking the positional arguments
            coln = args[0]
            div = args[1]
            attrs = args[2]
            elem = args[3]
            clean = args[4]
            gm = self.gallery
            ans = list()

            # visiting each element URL stored in the ID column
            for url in self.getdata(coln):

                # scrap, clean and serialize one element description
                soup = gm.scrape(url, div, attrs, **kwargs)
                desc = gm.clean_description(soup, elem, clean)
                ans.append(self.to_json(desc))
                # be polite with the remote server between requests
                time.sleep(DEFAULT_SLEEP_TIME)

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrap_descriptions")

    def scrap_paintlinks(self, *args, **kwargs):
        """
        scrap the data to download the painting file using the ID column name
        and the domain root URL

        Args:
            coln (str): ID column name of the gallery dataframe
            rurl (str): domain root URL to download the elements
            div (str): HTML <div> search and scrap keyword
            attrs (dict): decorative <div> keywords to refine the scrap
            elem (str): secondary <div> keyword to refine the search
            and scrap process

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of the URLs (HTTP) to download the elements
        """
        try:
            # unpacking the positional arguments
            coln = args[0]
            rurl = args[1]
            div = args[2]
            attrs = args[3]
            elem = args[4]
            gm = self.gallery
            ans = list()

            # visiting each element URL stored in the ID column
            for url in self.getdata(coln):

                # scrap the page and recover its download URL
                soup = gm.scrape(url, div, attrs, **kwargs)
                ans.append(gm.clean_dlurl(soup, rurl, elem))
                # be polite with the remote server between requests
                time.sleep(DEFAULT_SLEEP_TIME)

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrap_paintlinks")

    def dlpaints(self, *args):
        """
        download the paint files from the list of available asset url
        in the gallery

        Args:
            dlurl_coln (str): column name of known download URLs
            gfolder (str): name of the main gallery folder
            div (str): HTML <div> search and scrap keyword
            attrs (dict): decorative <div> keywords to refine the scrap
            elem (str): secondary <div> keyword to refine the search
            and scrap process
            clean (list): secondary <div> keywords to clean the scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of boolean marking if it is possible to
            download a picture file or not
        """
        try:
            # unpacking the positional arguments
            dlurl_coln = args[0]
            gf = args[1]
            div = args[2]
            attrs = args[3]
            elem = args[4]
            clean = args[5]
            gm = self.gallery
            ans = list()

            # visiting each download URL stored in the column
            for url in self.getdata(dlurl_coln):

                # skip null/na/none entries: only valid URLs are fetched
                if validators.url(str(url)) is not True:
                    ans.append(False)

                else:
                    # recover, clean and save the image file locally
                    soup = gm.get_imgfn(url, div, attrs)
                    imgfn = gm.clean_imgfn(soup, elem, clean)
                    ans.append(gm.get_imgf(gf, url, imgfn))

                # be polite with the remote server between requests
                time.sleep(DEFAULT_SLEEP_TIME)

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: dlpaints")

    def scrap_searchtags(self, *args, **kwargs):
        """
        Scrape each gallery element's search-tags and serialize them to JSON.

        Iterates the element URLs stored under the ID column, scrapes the
        relevant <div> from every page and cleans the recovered search-tags.

        Args:
            coln (str): ID column name of the gallery dataframe
            rurl (str): root URL of the domain to complete the search-tags
            div (str): HTML <div> keyword to scrap the search-tags
            attrs (dict): decorative attributes in the <div> keyword to refine
            elem (str): element is a secondary <div> keyword to refine the
            search and scrap process
            clean (str): secondary <div> keyword to clean the data from
            the scrap

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of element search-tags in JSON format
        """
        try:
            # unpack the positional configuration
            coln = args[0]
            rurl = args[1]
            div = args[2]
            attrs = args[3]
            elem = args[4]
            clean = args[5]
            gallery = self.gallery
            ans = list()

            for page_url in self.getdata(coln):
                # scrape the element page and extract its search tags
                page_soup = gallery.scrape(page_url, div, attrs, **kwargs)
                tags = gallery.clean_searchtags(rurl, page_soup, elem, clean)

                # keep the tags as JSON and throttle the requests
                ans.append(self.to_json(tags))
                time.sleep(DEFAULT_SLEEP_TIME)

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrap_searchtags")

    def scrap_objdata(self, *args, **kwargs):
        """
        Scrap the elements (paints) object-data using the ID column name
        in the index, HTML divisions <divs>, decorative attributes and
        secondary HTML elements

        Args:
            coln (str): ID column name of the gallery dataframe
            div (str): HTML <div> keyword to scrap the object-data
            attrs (dict): decorative attributes in the <div> keyword to refine
            elem (str): element is a secondary <div> keyword to refine the
            search and scrap process

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): list of element object-data in JSON format
        """
        try:
            # get the url list from the dataframe in the model
            ans = list()
            gm = self.gallery
            coln = args[0]
            div = args[1]
            attrs = args[2]
            elem = args[3]

            for url in self.getdata(coln):

                # scrape the element page and clean its object-data
                tsoup = gm.scrape(url, div, attrs, **kwargs)
                tans = gm.clean_objdata(tsoup, elem)

                # compose answer and throttle between requests
                tans = self.to_json(tans)
                ans.append(tans)
                time.sleep(DEFAULT_SLEEP_TIME)

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrap_objdata")

    def scrap_relwork(self, *args, **kwargs):
        """
        Scrape the related-work data of every gallery element.

        Walks the element URLs in the ID column, scrapes the configured
        <div> from each page and, when the page actually lists related
        work, cleans it before serializing it to JSON.

        Args:
            coln (str): ID column name of the gallery dataframe
            rurl (str): root URL of the domain to complete the related work
            div (str): HTML <div> keyword to search and scrap
            attrs (dict): decorative attributes in the <div> keyword to refine
            elem (str): element is a secondary <div> keyword to refine the
            search and scrap process
            clean (str): secondary <div> keyword to clean the scraped data

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): the list of the related work recovered from the
            gallery elements
        """
        try:
            # unpack the positional configuration
            coln, rurl = args[0], args[1]
            div, attrs = args[2], args[3]
            elem, clean = args[4], args[5]
            gallery = self.gallery
            ans = list()

            for page_url in self.getdata(coln):

                # scrape the element page
                page_soup = gallery.scrape(page_url, div, attrs, **kwargs)

                # an empty soup means no related work for this element
                related = dict()
                if len(page_soup) > 0:
                    related = gallery.clean_relwork(rurl, page_soup, elem, clean)

                # serialize and throttle between requests
                ans.append(self.to_json(related))
                time.sleep(DEFAULT_SLEEP_TIME)

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: scrap_relwork")

    def export_paints(self, *args):
        """
        Export the gallery images from each element's source folder into
        its target folder (color and grayscale variants).

        Args:
            coln (str): ID column name of the gallery dataframe
            sfext (str): source image file extension, ie.: "jpg"
            tfext (dict): target image file extension, ie.: "jpg"
            tsufix (dict): target image file sufix, ie.: "-rgb"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): the list of dict with the relative localpath
            file for each gallery element
            (ej.: {"rgb": "/Data/Img/s0004V1962r-rgb.jpg",
                    "bw": "/Data/Img/s0004V1962r-b&w.jpg"
                    })
        """
        try:
            # unpack the positional configuration
            coln, sfext = args[0], args[1]
            tfext, tsufix = args[2], args[3]
            gallery = self.gallery
            ans = list()

            for element_id in self.getdata(coln):
                # per-element source/target folders named after the ID
                source_dir = os.path.join(self.localg_path, element_id)
                target_dir = os.path.join(self.imgd_path, element_id)

                # recover sources, derive targets, then export
                source_files = gallery.get_srcimgs(source_dir, sfext)
                target_files = gallery.set_tgtimgs(source_files, target_dir,
                                                   tfext, tsufix)
                exported = gallery.export_imgs(source_files, target_files,
                                               tsufix)

                # serialize and throttle between elements
                ans.append(self.to_json(exported))
                time.sleep(DEFAULT_SHORT_SLEEP_TIME)

            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: export_paints")

    def export_shapes(self, *args):
        """
        Export the image shapes from the exported images in the target folder

        Args:
            coln (str): ID column name of the gallery dataframe
            tfext (dict): target image file extension, ie.: "jpg"
            tsufix (dict): target image file sufix, ie.: "-rgb"

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list) the list of dict with the shape of each
            gallery element
            (ej.: {"rgb": (450, 280, 3),
                    "bw": (450, 280)})
        """
        try:
            # default answer
            ans = list()
            # working variables
            coln = args[0]
            tfext = args[1]
            tsufix = args[2]

            gm = self.gallery
            ip = self.imgd_path

            # iterating over the index data
            for tid in self.getdata(coln):

                # config source and target folders
                tgtf = os.path.join(ip, tid)
                # recovering source images
                # NOTE(review): tfext is passed where export_paints passes a
                # str extension (sfext) — confirm whether str or dict is
                # expected by get_srcimgs here
                tgtfn = gm.get_srcimgs(tgtf, tfext)
                # exporting shapes
                tans = gm.export_shapes(tgtfn, tsufix)

                # compose answer
                tans = self.to_json(tans)
                ans.append(tans)
                time.sleep(DEFAULT_SHORT_SLEEP_TIME)

            # return answer list
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: export_shapes")

    def getdata(self, coln, *args, **kwargs):
        """
        get the data based in the column name of the model's dataframe

        Args:
            coln (str): column name of the gallery dataframe to get

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (list): data from the column name
        """
        try:
            # delegate to the gallery model (a dead `ans = list()` that was
            # immediately overwritten has been removed)
            gm = self.gallery
            ans = gm.getdata(coln, *args, **kwargs)

            # returning answer
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: getdata")

    # =========================================
    # dataframe CRUD functions
    # =========================================

    def newdf(self, columns, data):
        """
        Create a new dataframe for the gallery model.

        Args:
            columns (list): list of columns names for the new dataframe
            data (dataframe): new dataframe data, it can be empty

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (bool): true if the function created a new df-frame,
            false otherwise
        """
        try:
            # delegate the index creation to the gallery model
            return self.gallery.newidx(columns, data)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: newdf")

    def updata(self, column, data):
        """
        Update the data in one column of the gallery model (dataframe).

        Args:
            column (str): model column name to update
            data (list): new data to update in the column, must be of the same
            the dataframe column

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dataframe.info()): pandas description of dataframe
        """
        try:
            # delegate the column update to the gallery model
            return self.gallery.updata(column, data)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: updata")

    def save_gallery(self, fname, folder):
        """
        Write the gallery model (pandas) into a CSV file.

        Args:
            fname (str): file name to write the gallery model
            folder (str): subfolder to write the CSV file

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # delegate persistence to the gallery model
            return self.gallery.save_gallery(fname, folder)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: save_gallery")

    def load_gallery(self, fname, folder):
        """
        Read the gallery model (pandas) from a CSV file.

        Args:
            fname (str): file name from where to read the gallery model
            folder (str): subfolder from where to read the CSV file

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # delegate loading to the gallery model
            return self.gallery.load_gallery(fname, folder)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: load_gallery")

    def check_gallery(self):
        """
        checks the data stats of the gallery dataframe

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (dataframe.info()): pandas description of dataframe
        """
        try:
            gm = self.gallery
            # return the delegate's result — the previous version computed it
            # and discarded it, contradicting the documented return value
            ans = gm.check_gallery()
            return ans

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: check_gallery")

    # =========================================
    # dataframe I/O functions
    # =========================================

    def export_json(self, gfolder, incol, expcol, fname):
        """
        Export one dataframe column into per-element JSON files inside the
        local gallery folders.

        Args:
            gfolder (str): name of the main gallery folder
            incol (str): name of the column in the dataframe with the
            gallery index with unique IDs for each elements (same as the local
            folder's names)
            expcol (str): name of the column with the data to export to JSON
            fname (str): name of the file to save

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # index IDs and their matching export payloads
            element_ids = self.getdata(incol)
            export_data = self.getdata(expcol)
            # the same file name is reused inside every element folder
            target_file = fname + ".json"

            for element_id, payload in zip(element_ids, export_data):
                self.write_json(payload, target_file, gfolder, element_id)
                time.sleep(DEFAULT_SHORT_SLEEP_TIME)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: export_json")

    def write_json(self, data, filename, *args):
        """
        Save a json into a local file according to the gallery folder
        and subfolders

        Args:
            data (JSON): JSON data to save in file
            filename (str): JSON file name
            gfolder (str): name of the main gallery folder
            subfolders (str): list of subfolder names to the main gallery
            folder, can be as much as neeeded

        Raises:
            exp: raise a generic exception if something goes wrong
        """
        try:
            # configuring local filepath
            lfp = os.path.join(*args, filename)

            # saving data with utf-8 encoding; the with-statement already
            # closes the file, so the previous explicit close() was redundant
            with open(lfp, "w", encoding="utf-8") as file:
                file.write(data)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: write_json")

    def to_json(self, data):
        """
        Transform a python dictionary into a JSON string.

        Args:
            data (dict): dictionary with the relevant data to transform

        Raises:
            exp: raise a generic exception if something goes wrong

        Returns:
            ans (JSON): a proper JSON object containing the dictionary data
        """
        try:
            # deep-copy first so serialization never touches the caller's data
            snapshot = copy.deepcopy(data)
            return json.dumps(snapshot, ensure_ascii=False, indent=4)

        # exception handling
        except Exception as exp:
            Err.reraise(exp, "Controller: to_json")