Пример #1
0
class Config:
    """
    Class responsible for reading the configurations file

    The file is parsed when the object is created; its sections are
    then served from memory through get().
    """
    def __init__(self):

        self.log = Logging("Config")

        # SafeConfigParser was only a deprecated alias of ConfigParser and
        # no longer exists in Python 3.12+.
        self.config_parser = configparser.ConfigParser()
        self.read_file()

    def read_file(self, path="src/config.ini"):
        """
        Read the content of the configurations file

        :param path: location of the ini file; a relative path is
            resolved against the current working directory
        """
        try:
            # read() skips files it cannot open instead of raising and
            # returns the list of files it parsed, so an empty list
            # means nothing was loaded.
            if not self.config_parser.read(path):
                self.log.error(
                    "Could not read configurations file <%s>" % path)

        except configparser.Error as err:
            self.log.error(err)

    def get(self, section):
        """
        Return the specified section of the configurations file
        as a dict

        Values are returned as raw strings (no interpolation). Options
        from the DEFAULT section, if any, are included.

        :param section: name of the section to fetch
        :returns: dict mapping option names to their string values, or
            an empty dict when the section does not exist
        """
        if not self.config_parser.has_section(section):
            self.log.error(
                "Section <%s> not found in configurations file" % section)
            return {}

        return dict(self.config_parser.items(section, raw=True))
Пример #2
0
class Authors:

    """
    Class responsible for inserting the authors in the database
    """

    def __init__(self, manga_id, authors=None):
        """
        :param manga_id: id of the manga the authors belong to
        :param authors: iterable of author names; defaults to an
            empty list
        """

        self.log = Logging("weeb_crawler")

        self.manga_id = manga_id
        self.authors = [] if authors is None else authors

    def save(self):
        """
        Save the manga authors at the database

        Each author is inserted only when no row with the same name
        and manga id exists yet. The number of inserted authors is
        logged.
        """
        new_authors = 0
        database = Database()
        check_query = """SELECT id FROM authors WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO authors VALUES (NULL, %s, %s)"""
        for author in self.authors:
            result = database.execute(check_query, [author, self.manga_id])
            # Compare by value, not identity: `is ()` only worked because
            # CPython happens to share one empty tuple. Equality with ()
            # (rather than plain falsiness) keeps the [] that
            # Database.execute returns on failure from being mistaken
            # for "no such row".
            if result == ():
                database.execute(insert_query, [author, self.manga_id])
                new_authors += 1

        self.log.info("Found %s new author(s)" % new_authors)
Пример #3
0
    def __init__(self, manga_id, titles=None):
        """
        Keep the manga id and its titles; an empty list is used
        when no titles are given
        """
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        self.titles = [] if titles is None else titles
Пример #4
0
    def __init__(self, manga_id, artists=None):
        """
        Keep the manga id and its artists; an empty list is used
        when no artists are given
        """
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        self.artists = [] if artists is None else artists
Пример #5
0
class Api:
    """
    Flask application exposing the log file and the database
    statistics over HTTP
    """
    def __init__(self):
        """
        Load the "api" section of the configurations file, build the
        Flask app and register its routes
        """

        self.cfg = Config()
        cfg = self.cfg.get("api")

        self.log = Logging("Api")
        self.debug = int(cfg["logging"])
        self.host = cfg["host"]
        self.port = int(cfg["port"])
        self.app = self.create_app()
        self.add_routes()

    def create_app(self):
        """
        Create the Flask app with CORS enabled

        Both the app logger and the "werkzeug" logger are disabled
        when the "logging" option is 0.

        :returns: the configured Flask application
        """

        app = Flask(__name__)
        app.debug = self.debug

        # The attribute that silences a logger is `disabled`; assigning
        # to `disable` only created an unused attribute.
        app.logger.disabled = not self.debug
        werkzeug_log = logging.getLogger("werkzeug")
        werkzeug_log.disabled = not self.debug

        CORS(app)
        app.config['CORS_HEADERS'] = 'Content-Type'

        return app

    def add_routes(self):
        """
        Register the /getLog and /getStats routes on the app
        """
        @self.app.route("/getLog", methods=["GET"])
        @cross_origin()
        def get_log():
            """
            Return the last lines of the log file as json; the amount
            comes from the "getLog" section of the configurations file
            """
            log_name = self.log.get_log_name()
            cfg = self.cfg.get("getLog")
            return_amount = int(cfg["return_amount"])
            with open(log_name, "r") as log_file:
                lines = log_file.readlines()

            # lines[-0:] is the whole list, so a non-positive amount has
            # to be handled explicitly to really return nothing.
            content = lines[-return_amount:] if return_amount > 0 else []
            return_data = {
                "name": log_name,
                "amount": return_amount,
                "content": content
            }
            return json.dumps(return_data)

        @self.app.route("/getStats", methods=["GET"])
        @cross_origin()
        def get_stats():
            """
            Return the amount of mangas, chapters and pages stored
            in the database as json
            """
            database = Database()
            result = database.execute("""SELECT
                                        (SELECT COUNT(id) FROM manga) AS manga_amount,
                                        (SELECT COUNT(id) FROM chapter) AS chapter_amount,
                                        (SELECT COUNT(id) FROM page) AS page_amount"""
                                      )
            # NOTE(review): Database.execute returns [] when the query
            # fails, in which case this lookup raises IndexError.
            return json.dumps(result[0])

    def run(self):
        """
        Start serving the app on the configured host and port
        """

        self.log.info("Api started in %s:%s" % (self.host, self.port))
        self.app.run(use_reloader=False, host=self.host, port=self.port)
Пример #6
0
    def __init__(self, manga_id, authors=None):
        """
        Keep the manga id and its authors; an empty list is used
        when no authors are given
        """
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        self.authors = [] if authors is None else authors
Пример #7
0
    def __init__(self):
        """
        Read the "api" section of the configurations file, then
        create the app and register its routes
        """
        self.cfg = Config()
        api_settings = self.cfg.get("api")
        self.log = Logging("Api")

        # "logging" and "port" are stored as text in the file
        self.debug = int(api_settings["logging"])
        self.host = api_settings["host"]
        self.port = int(api_settings["port"])

        self.app = self.create_app()
        self.add_routes()
Пример #8
0
    def __init__(self):
        """
        Load the connection settings from the "database" section of
        the configurations file and open the connection
        """
        settings = Config().get('database')
        self.host = settings['host']
        self.name = settings['name']
        self.user = settings['user']
        self.passwd = settings['passwd']

        self.log = Logging("Database")
        self.conn = self.connect()
Пример #9
0
    def __init__(self, manga_id, urls=None):
        """
        Keep the manga id and the chapter urls; an empty list is
        used when no urls are given
        """
        self.log = Logging("weeb_crawler")
        self.urls = [] if urls is None else urls
        self.manga_id = manga_id
        self.all_pages = False
Пример #10
0
class Chapters:
    """
    Get the chapters of the manga
    """
    def __init__(self, manga_id, urls=None):
        """
        :param manga_id: id of the manga the chapters belong to
        :param urls: list of chapter urls; defaults to an empty list
        """

        self.log = Logging("weeb_crawler")

        self.urls = [] if urls is None else urls

        # Whether the chapter currently being processed by save()
        # already has all its pages stored
        self.all_pages = False
        self.manga_id = manga_id

    def save(self):
        """
        Save the chapter in the database

        The chapter number is the last path segment of each url.
        Unknown chapters are inserted; for every chapter not yet
        flagged with all_pages, its pages are saved and the flag is
        set. The number of inserted chapters is logged.
        """
        new_chapters = 0
        database = Database()
        check_query = """SELECT id, all_pages FROM chapter WHERE manga_id=%s AND number=%s"""
        insert_query = """INSERT INTO chapter VALUES (NULL, %s, %s, 0, %s)"""
        update_query = """UPDATE chapter SET all_pages=1 WHERE id=%s"""
        for url in self.urls:
            chapter_number = url.split("/")[-1]
            result = database.execute(check_query,
                                      [self.manga_id, chapter_number])

            # Compare by value: `is ()` depended on CPython sharing a
            # single empty tuple object.
            if result == ():
                database.execute(insert_query,
                                 [chapter_number, url, self.manga_id])
                chapter_id = database.last_inserted_id()
                # A chapter inserted just now has no pages yet. Without
                # this reset the flag of a previous, complete chapter
                # would make every following new chapter skip its pages.
                self.all_pages = False
                new_chapters += 1

            elif result:
                chapter_id = result[0][0]
                self.all_pages = result[0][1] == 1

            else:
                # Database.execute returns [] when the lookup failed
                # (the error is logged there); nothing can be decided
                # for this chapter.
                continue

            if not self.all_pages:
                chapter_pages = Pages(self.manga_id, chapter_id, url)
                chapter_pages.save()
                database.execute(update_query, [chapter_id])

        self.log.info("Found %s new chapter(s)" % new_chapters)
Пример #11
0
    def __init__(self, url):
        """
        Capture the manga found at the given url and save it

        A manga whose page url is not stored yet is inserted together
        with its titles, authors, artists, genders, covers and
        chapters. A stored manga only gets its covers and chapters
        processed again.

        :param url: url of the manga page
        """

        self.log = Logging("weeb_crawler")

        conf = Config()
        conf = conf.get("muID")
        self.title_diff_ratio = float(conf['diff_ratio'])

        self.url = url
        self.id = None

        query = """SELECT id FROM manga WHERE page_url=%s"""
        database = Database()
        result = database.execute(query, [url])

        # NOTE(review): this handle is not closed anywhere in this method
        self.file = open("last-manga-content.html", "w")

        self.page = self.get_page()
        self.title = self.get_title()
        self.muID = self.get_mu_id()
        # Compare by value: `is ()` depended on CPython sharing a single
        # empty tuple object.
        if result == ():
            self.description = self.get_description()
            self.alternative_titles = None
            self.gender_tags = None
            self.authors = None
            self.artists = None
            self.status = None
            self.get_header_info()

            self.save()
            self.save_titles()
            self.save_authors()
            self.save_artists()
            self.save_gender()
            self.get_covers()
            self.get_chapters()
            self.log.info("Added new Manga: %s" % self.title)

        elif result:
            self.id = result[0][0]
            self.get_covers()
            self.get_chapters()
            self.log.info("Updated Manga: %s" % self.title)

        else:
            # Database.execute returns [] when the lookup itself failed;
            # indexing it would raise IndexError.
            self.log.error("Could not check if manga <%s> is stored" % url)
Пример #12
0
class Titles:

    """
    Class responsible for inserting the manga alternative
    titles in the database
    """

    def __init__(self, manga_id, titles=None):
        """
        :param manga_id: id of the manga the titles belong to
        :param titles: iterable of alternative titles; defaults to an
            empty list
        """

        self.log = Logging("weeb_crawler")

        self.manga_id = manga_id
        self.titles = [] if titles is None else titles

    def save(self):
        """
        Insert the manga alternative titles
        in the database

        One leading space is removed from each title. Empty titles and
        the "-" placeholder are skipped; the others are inserted when
        no row with the same name and manga id exists. The number of
        inserted titles is logged.
        """
        new_titles = 0
        database = Database()
        check_query = """SELECT id FROM titles WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO titles VALUES (NULL, %s, %s)"""
        for title in self.titles:
            # Normalise before the lookup so the duplicate check and the
            # insert use the same value; checking the raw title let
            # titles with a leading space be inserted again on every run.
            # startswith() also copes with an empty string, where
            # title[0] raised IndexError.
            title = title[1:] if title.startswith(" ") else title
            if not title or title == "-":
                continue

            result = database.execute(check_query, [title, self.manga_id])
            # Compare by value: `is ()` depended on CPython sharing a
            # single empty tuple object.
            if result == ():
                database.execute(insert_query, [title, self.manga_id])
                new_titles += 1

        self.log.info("Found %s new alternative title(s)" % new_titles)
Пример #13
0
    def __init__(self, url):
        """
        Keep the target url and prepare the gzip decoding helper
        """
        def decode_gzip(response):
            # wbits of MAX_WBITS + 16 is passed straight to zlib
            return zlib.decompress(response, zlib.MAX_WBITS + 16)

        self.log = Logging("Requests")
        self.url = url
        self.decode_gzip = decode_gzip
Пример #14
0
class Manga:
    """
    Manga class, one of the main classes of the program.
    Responsible for capturing and saving the main piece
    of the system
    """
    def __init__(self, url):
        """
        Capture the manga found at the given url and save it

        A manga whose page url is not stored yet is inserted together
        with its titles, authors, artists, genders, covers and
        chapters. A stored manga only gets its covers and chapters
        processed again.

        :param url: url of the manga page
        """

        self.log = Logging("weeb_crawler")

        conf = Config()
        conf = conf.get("muID")
        self.title_diff_ratio = float(conf['diff_ratio'])

        self.url = url
        self.id = None

        query = """SELECT id FROM manga WHERE page_url=%s"""
        database = Database()
        result = database.execute(query, [url])

        self.page = self.get_page()
        self.title = self.get_title()
        self.muID = self.get_mu_id()
        # Compare by value: `is ()` depended on CPython sharing a single
        # empty tuple object.
        if result == ():
            self.description = self.get_description()
            self.alternative_titles = None
            self.gender_tags = None
            self.authors = None
            self.artists = None
            self.status = None
            self.get_header_info()

            self.save()
            self.save_titles()
            self.save_authors()
            self.save_artists()
            self.save_gender()
            self.get_covers()
            self.get_chapters()
            self.log.info("Added new Manga: %s" % self.title)

        elif result:
            self.id = result[0][0]
            self.get_covers()
            self.get_chapters()
            self.log.info("Updated Manga: %s" % self.title)

        else:
            # Database.execute returns [] when the lookup itself failed;
            # indexing it would raise IndexError.
            self.log.error("Could not check if manga <%s> is stored" % url)

    def get_page(self):
        """
        Requests the manga page and returns a soup

        The page markup is also dumped to last-manga-content.html.
        """
        req = Request(self.url)
        soup = req.soup()
        # The dump is opened and closed here so no handle is left open.
        # Writing str(soup) stores the markup itself; str() of the
        # encoded bytes stored their b'...' representation instead.
        with open("last-manga-content.html", "w", encoding="utf-8") as dump:
            dump.write(str(soup))
        return soup

    def get_title(self):
        """
        Get the manga title, return the text of the first h2
        """
        return self.page.findAll("h2")[0].text

    def get_description(self):
        """
        Get the manga description, search for a div with
        a specific class and returns the inner text
        """
        desc_container = self.page.find("div",
                                        {"class": "panel panel-default"})
        desc_body = desc_container.find("div", {"class": "panel-body"})
        return desc_body.text

    def get_header_info(self):
        """
        Get the manga info present on the top of the page

        Fills alternative_titles, gender_tags, authors, artists and
        status from the h4 headings, in that order.
        """
        header_content = self.page.findAll(
            "h4", {"class": "media-heading manga-perfil"})

        # The second node of the first heading holds the titles,
        # separated by ", "
        self.alternative_titles = re.split(', ',
                                           header_content[0].contents[1])

        gender_tags_container = header_content[1].findAll("a", {"href": True})
        self.gender_tags = [tag.text for tag in gender_tags_container]

        self.authors = header_content[2].contents[1:]
        self.artists = header_content[3].contents[1:]

        status_tag = header_content[4].find("span")
        self.status = status_tag.text

    def get_mu_id(self):
        """
        Uses MCD api to search for the
        id to be used on MCD and Manga Updates

        :returns: id of the search result whose title is the most
            similar to the manga title, provided the similarity is
            above title_diff_ratio; None otherwise
        """
        req = Request('https://mcd.iosphe.re/api/v1/search/')
        results = req.get_json({"Title": self.title})
        # get_json returns None when the request or the decoding fails
        candidates = []
        if isinstance(results, dict):
            candidates = results.get('Results') or []

        # Keep the closest title instead of whichever match comes last
        mu_id = None
        best_ratio = self.title_diff_ratio
        for result in candidates:
            ratio = SequenceMatcher(None, self.title, result[1]).ratio()
            if ratio > best_ratio:
                mu_id = result[0]
                best_ratio = ratio

        return mu_id

    def get_chapters(self):
        """
        Get all the chapter from the manga main page,
        then save each one
        """
        chapters = []
        chapter_containers = self.page.findAll(
            "div", {"class": "row lancamento-linha"})
        for chapter_container in chapter_containers:
            link_column = chapter_container.findAll(
                "div", {"class": "col-xs-6 col-md-6"})[0]
            chapters.append(
                link_column.findAll("a", {"href": True})[0]['href'])

        chapter = Chapters(self.id, chapters)
        chapter.save()

    def get_covers(self):
        """
        Call the covers class and its methods

        Nothing is done when no muID was found for the manga.
        """
        if self.muID is not None:
            covers = Covers(self.id, self.muID)
            covers.get()
            covers.save()

    def save(self):
        """
        Save the manga in the database and keep the generated id
        """
        database = Database()
        query = """INSERT INTO manga VALUES (NULL, %s, %s, %s, %s, %s, %s, %s, %s)"""
        database.execute(query, [
            self.muID, self.url, None, self.title, self.description,
            self.status, 0, None
        ])
        self.id = database.last_inserted_id()

    def save_authors(self):
        """
        Save the manga authors
        """
        authors = Authors(self.id, self.authors)
        authors.save()

    def save_artists(self):
        """
        Save the manga artists
        """
        artists = Artists(self.id, self.artists)
        artists.save()

    def save_titles(self):
        """
        Save the manga alternative titles
        """
        titles = Titles(self.id, self.alternative_titles)
        titles.save()

    def save_gender(self):
        """
        Save the manga genders
        """
        genders = Genders(self.id, self.gender_tags)
        genders.save()
Пример #15
0
    def __init__(self):
        """
        Create the parser and load the configurations file
        """

        self.log = Logging("Config")

        # SafeConfigParser was only a deprecated alias of ConfigParser and
        # no longer exists in Python 3.12+.
        self.config_parser = configparser.ConfigParser()
        self.read_file()
Пример #16
0
class Request:
    """
    Handles the requests
    """
    def __init__(self, url):
        """
        :param url: url every request of this object is sent to
        """

        self.log = Logging("Requests")
        self.url = url
        self.decode_gzip = lambda response: zlib.decompress(
            response, 16 + zlib.MAX_WBITS)

    def header(self, header_type):
        """
        Returns the headers stored in the json file
        src/headers/<header_type>.json

        :param header_type: name of the header file, without extension
        :returns: the decoded json content, or an empty dict when the
            file cannot be read or decoded
        """
        try:
            # The with block closes the file even if decoding fails
            with open("src/headers/%s.json" % header_type, "r") as header_file:
                return json.load(header_file)

        except (OSError, ValueError) as err:
            # ValueError covers malformed json. An empty dict means "no
            # extra headers"; the former {'': ''} fallback was a header
            # with an empty name.
            self.log.error(err)
            return {}

    def request_page(self):
        """
        Makes the actual request, returns the response text

        The request is repeated until the status code is 200.

        :returns: the body of the response, or the string "Error" when
            the request raised an exception
        """
        try:
            # Read the header file once instead of once per attempt
            headers = self.header('get')
            req = get(self.url, headers=headers)
            while req.status_code != 200:
                req = get(self.url, headers=headers)

            # NOTE(review): presumably never taken - `encoding` looks
            # like the text charset, and decode_gzip is handed text
            # rather than bytes; confirm before relying on it.
            if req.encoding == 'gzip':
                return self.decode_gzip(req.text)
            else:
                return req.text

        except exceptions.TooManyRedirects:
            self.log.error("Request too many redirections on url <%s>" %
                           self.url)
            return "Error"

        except exceptions.Timeout:
            self.log.error("Request timeout on url <%s>" % self.url)
            return "Error"

        except exceptions.RequestException as err:
            self.log.error(err)
            return "Error"

    def get_json(self, send_data=None):
        """
        Send json post and expects a json return

        :param send_data: json serializable payload; defaults to an
            empty list
        :returns: the decoded json response, or None when anything
            fails (the error is logged)
        """
        if send_data is None:
            send_data = []

        try:
            req = post(self.url,
                       headers=self.header('post'),
                       data=json.dumps(send_data))
            return req.json()

        except Exception as err:
            self.log.error(err)
            return None

    def soup(self):
        """
        Request wrapper, to guarantee request is successful

        The page is requested again for as long as request_page
        reports "Error".

        :returns: the page parsed with html5lib
        """
        request_result = self.request_page()
        # Compare by value: `is "Error"` only held because CPython
        # happens to reuse the same string object.
        while request_result == "Error":
            request_result = self.request_page()

        return BeautifulSoup(request_result, "html5lib")
Пример #17
0
class Database(metaclass=Singleton):

    """
    Class responsible for connecting and making
    operations on the database, it uses a singleton
    to maintain a always open connection,
    thus saving resources
    """

    def __init__(self):
        """
        Load the connection settings from the "database" section of
        the configurations file and open the connection
        """

        conf = Config()
        database_conf = conf.get('database')
        self.host = database_conf['host']
        self.name = database_conf['name']
        self.user = database_conf['user']
        self.passwd = database_conf['passwd']

        self.log = Logging("Database")
        self.conn = self.connect()

    def connect(self):
        """
        Responsible for opening the connection to the database
        and setting the charset

        :returns: the open connection, or False when connecting fails
            (the error is logged)
        """
        try:
            # Keyword arguments keep the call independent of the
            # parameter order of pymysql.connect.
            conn = pymysql.connect(host=self.host,
                                   user=self.user,
                                   password=self.passwd,
                                   database=self.name)
            conn.set_charset('utf8')
            with conn.cursor() as cursor:
                cursor.execute('SET NAMES utf8;')
                cursor.execute('SET CHARACTER SET utf8;')
                cursor.execute('SET character_set_connection=utf8;')
            return conn

        except pymysql.MySQLError as err:
            self.log.error(err)
            return False

    def disconnect(self):
        """
        Responsible for closing the connection to the database

        Does nothing when there is no open connection.
        """
        # conn is False when connect() failed
        if self.conn and self.conn.open:
            self.conn.close()

    def execute(self, query, params=None):
        """
        Responsible for executing queries on the database

        The connection is opened again when needed and the query is
        committed before its rows are fetched.

        :param query: sql statement with %s placeholders
        :param params: values for the placeholders; defaults to an
            empty list
        :returns: the fetched rows, or an empty list when connecting
            or executing fails (the error is logged)
        """
        if params is None:
            params = []

        # conn is False when a previous connect() failed, so it has to
        # be tested before reading `.open`.
        if not self.conn or not self.conn.open:
            self.conn = self.connect()

        if not self.conn:
            # connect() already logged the reason
            return []

        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, params)
                self.conn.commit()
                return cursor.fetchall()

        except pymysql.MySQLError as err:
            self.log.error(err)
            return []

    def last_inserted_id(self):
        """
        Return the value of LAST_INSERT_ID() for the connection

        :returns: the id, or None when it cannot be read (the error
            is logged)
        """
        query = """SELECT LAST_INSERT_ID()"""
        rows = self.execute(query)
        try:
            return rows[0][0]

        except (LookupError, TypeError) as err:
            # execute() returns [] on failure, so there is no row to read
            self.log.error(err)
            return None