class Config:
    """ Class responsible for reading the configuration file """

    def __init__(self):
        self.log = Logging("Config")
        # SafeConfigParser was deprecated and removed in Python 3.12; ConfigParser is the replacement
        self.config_parser = configparser.ConfigParser()
        self.read_file()

    def read_file(self):
        """ Read the content of the configuration file """
        try:
            self.config_parser.read("src/config.ini")
        except configparser.Error as err:
            self.log.error(err)

    def get(self, section):
        """ Return the specified section of the configuration file as a dict """
        try:
            # items() raises configparser.NoSectionError (caught below) on a missing
            # section, unlike the private _sections lookup, which raises KeyError
            return dict(self.config_parser.items(section))
        except configparser.Error as err:
            self.log.error(err)
            return {}
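# A minimal sketch of the sections src/config.ini is expected to contain, inferred from
# the cfg.get(...) calls elsewhere in this code ("api", "getLog", "muID", "database").
# Every value below is a placeholder, not the project's real settings, and the snippet
# assumes the src/ directory already exists.
import configparser

example_cfg = configparser.ConfigParser()
example_cfg.read_dict({
    "api": {"logging": "1", "host": "0.0.0.0", "port": "5000"},
    "getLog": {"return_amount": "50"},
    "muID": {"diff_ratio": "0.9"},
    "database": {"host": "localhost", "name": "weeb_crawler",
                 "user": "root", "passwd": "change-me"},
})

with open("src/config.ini", "w") as cfg_file:
    example_cfg.write(cfg_file)   # writes an INI file that Config.read_file() can load

# Typical usage afterwards:
#   database_conf = Config().get("database")   # returns the section as a dict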
class Authors:
    """ Class responsible for inserting the authors in the database """

    def __init__(self, manga_id, authors=None):
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        if authors is None:
            self.authors = []
        else:
            self.authors = authors

    def save(self):
        """ Save the manga authors in the database """
        new_authors = 0
        database = Database()
        check_query = """SELECT id FROM authors WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO authors VALUES (NULL, %s, %s)"""
        for author in self.authors:
            result = database.execute(check_query, [author, self.manga_id])
            # An empty result tuple means this author is not stored yet
            if not result:
                database.execute(insert_query, [author, self.manga_id])
                new_authors += 1
        self.log.info("Found %s new author(s)" % new_authors)
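# Usage sketch: the Manga class normally builds this from the scraped page header, but
# it can be exercised directly; the manga id and names below are made up.
example_authors = Authors(42, ["Some Author", "Another Author"])
example_authors.save()   # inserts only the names not already stored for manga 42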
class Artists:
    """ Class responsible for inserting the artists in the database """

    def __init__(self, manga_id, artists=None):
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        if artists is None:
            self.artists = []
        else:
            self.artists = artists
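
    # The save() method for Artists is not part of this excerpt. A minimal sketch,
    # assuming an `artists` table with the same (id, name, manga_id) layout as the
    # authors table, would mirror Authors.save:
    def save(self):
        """ Save the manga artists in the database """
        new_artists = 0
        database = Database()
        check_query = """SELECT id FROM artists WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO artists VALUES (NULL, %s, %s)"""
        for artist in self.artists:
            result = database.execute(check_query, [artist, self.manga_id])
            if not result:
                database.execute(insert_query, [artist, self.manga_id])
                new_artists += 1
        self.log.info("Found %s new artist(s)" % new_artists)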
class Api:
    """ Exposes the crawler log and database statistics through a small Flask API """

    def __init__(self):
        self.cfg = Config()
        cfg = self.cfg.get("api")
        self.log = Logging("Api")
        self.debug = int(cfg["logging"])
        self.host = cfg["host"]
        self.port = int(cfg["port"])
        self.app = self.create_app()
        self.add_routes()

    def create_app(self):
        """ Create the Flask app and silence its loggers when debug is off """
        app = Flask(__name__)
        app.debug = self.debug
        app.logger.disabled = not self.debug
        log = logging.getLogger("werkzeug")
        log.disabled = not self.debug
        cors = CORS(app)
        app.config['CORS_HEADERS'] = 'Content-Type'
        return app

    def add_routes(self):
        """ Register the API routes """
        @self.app.route("/getLog", methods=["GET"])
        @cross_origin()
        def get_log():
            log_name = self.log.get_log_name()
            cfg = self.cfg.get("getLog")
            return_amount = int(cfg["return_amount"])
            with open(log_name, "r") as log:
                return_data = {
                    "name": log_name,
                    "amount": return_amount,
                    "content": log.readlines()[-return_amount:]
                }
            return json.dumps(return_data)

        @self.app.route("/getStats", methods=["GET"])
        @cross_origin()
        def get_stats():
            database = Database()
            result = database.execute(
                """SELECT
                       (SELECT COUNT(id) FROM manga) AS manga_amount,
                       (SELECT COUNT(id) FROM chapter) AS chapter_amount,
                       (SELECT COUNT(id) FROM page) AS page_amount""")
            return json.dumps(result[0])

    def run(self):
        self.log.info("Api started on %s:%s" % (self.host, self.port))
        self.app.run(use_reloader=False, host=self.host, port=self.port)
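# A usage sketch for the two routes above: start the API (Api().run() blocks) and query
# it from another process. The URL assumes the placeholder config values host=127.0.0.1
# and port=5000; with the default pymysql cursor, /getStats serializes the row as a
# JSON array rather than an object.
from requests import get as http_get

if __name__ == "__main__":
    stats = http_get("http://127.0.0.1:5000/getStats").json()
    manga_amount, chapter_amount, page_amount = stats

    log_tail = http_get("http://127.0.0.1:5000/getLog").json()
    print("".join(log_tail["content"]))   # the last `return_amount` lines of the log file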
class Chapters:
    """ Get the chapters of the manga """

    def __init__(self, manga_id, urls=None):
        self.log = Logging("weeb_crawler")
        if urls is None:
            self.urls = []
        else:
            self.urls = urls
        self.all_pages = False
        self.manga_id = manga_id

    def save(self):
        """ Save the chapters in the database """
        new_chapters = 0
        database = Database()
        check_query = """SELECT id, all_pages FROM chapter WHERE manga_id=%s AND number=%s"""
        insert_query = """INSERT INTO chapter VALUES (NULL, %s, %s, 0, %s)"""
        update_query = """UPDATE chapter SET all_pages=1 WHERE id=%s"""
        for url in self.urls:
            chapter_id = None
            # The chapter number is the last segment of the chapter URL
            chapter_number = url.split("/")[-1]
            result = database.execute(check_query, [self.manga_id, chapter_number])
            if not result:
                database.execute(insert_query, [chapter_number, url, self.manga_id])
                chapter_id = database.last_inserted_id()
                self.all_pages = False  # a freshly inserted chapter has no pages yet
                new_chapters += 1
            else:
                chapter_id = result[0][0]
                self.all_pages = result[0][1] == 1
            if not self.all_pages:
                chapter_pages = Pages(self.manga_id, chapter_id, url)
                chapter_pages.save()
                database.execute(update_query, [chapter_id])
        self.log.info("Found %s new chapter(s)" % new_chapters)
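# Usage sketch: Manga.get_chapters() collects the chapter URLs from the manga page and
# hands them to this class. The URLs below are made-up examples of the expected shape,
# where the chapter number is the last path segment.
example_chapter_urls = [
    "https://example.com/manga/some-title/120",
    "https://example.com/manga/some-title/121",
]
example_chapters = Chapters(42, example_chapter_urls)   # 42 is a placeholder manga id
example_chapters.save()   # inserts new chapters and fetches their pages via Pages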
class Titles:
    """ Class responsible for inserting the manga alternative titles in the database """

    def __init__(self, manga_id, titles=None):
        self.log = Logging("weeb_crawler")
        self.manga_id = manga_id
        if titles is None:
            self.titles = []
        else:
            self.titles = titles

    def save(self):
        """ Insert the manga alternative titles in the database """
        new_titles = 0
        database = Database()
        check_query = """SELECT id FROM titles WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO titles VALUES (NULL, %s, %s)"""
        for title in self.titles:
            # Normalize the title before the duplicate check so that " Title" and
            # "Title" are treated as the same entry
            title = title.strip()
            result = database.execute(check_query, [title, self.manga_id])
            if title and title != "-" and not result:
                database.execute(insert_query, [title, self.manga_id])
                new_titles += 1
        self.log.info("Found %s new alternative title(s)" % new_titles)
class Manga:
    """ Manga class, one of the main classes of the program.
        Responsible for capturing and saving the main pieces of the system """

    def __init__(self, url):
        self.log = Logging("weeb_crawler")
        conf = Config()
        conf = conf.get("muID")
        self.title_diff_ratio = float(conf['diff_ratio'])
        self.url = url
        self.id = None
        query = """SELECT id FROM manga WHERE page_url=%s"""
        database = Database()
        result = database.execute(query, [url])
        self.file = open("last-manga-content.html", "w", encoding="utf-8")
        self.page = self.get_page()
        self.title = self.get_title()
        self.muID = self.get_mu_id()
        if not result:
            self.description = self.get_description()
            self.alternative_titles = None
            self.gender_tags = None
            self.authors = None
            self.artists = None
            self.status = None
            self.get_header_info()
            self.save()
            self.save_titles()
            self.save_authors()
            self.save_artists()
            self.save_gender()
            self.get_covers()
            self.get_chapters()
            self.log.info("Added new Manga: %s" % self.title)
        else:
            self.id = result[0][0]
            self.get_covers()
            self.get_chapters()
            self.log.info("Updated Manga: %s" % self.title)

    def get_page(self):
        """ Request the manga page and return a soup """
        req = Request(self.url)
        soup = req.soup()
        # Dump the raw HTML for debugging; writing str(soup) keeps it readable text
        self.file.write(str(soup))
        return soup

    def get_title(self):
        """ Get the manga title, return the text of the first h2 """
        return self.page.findAll("h2")[0].text

    def get_description(self):
        """ Get the manga description: search for a div with a specific class
            and return the inner text """
        desc_container = self.page.find("div", {"class": "panel panel-default"})
        desc_body = desc_container.find("div", {"class": "panel-body"})
        return desc_body.text

    def get_header_info(self):
        """ Get the manga info present at the top of the page """
        header_content = self.page.findAll(
            "h4", {"class": "media-heading manga-perfil"})
        self.alternative_titles = re.split(', ', header_content[0].contents[1:][0])
        gender_tags_container = header_content[1].findAll("a", {"href": True})
        self.gender_tags = [tag.text for tag in gender_tags_container]
        self.authors = header_content[2].contents[1:]
        self.artists = header_content[3].contents[1:]
        status_tag = header_content[4].find("span")
        self.status = status_tag.text

    def get_mu_id(self):
        """ Use the MCD API to search for the id to be used on MCD and Manga Updates """
        mu_id = None
        req = Request('https://mcd.iosphe.re/api/v1/search/')
        results = req.get_json({"Title": self.title})
        for result in results['Results']:
            if SequenceMatcher(None, self.title, result[1]).ratio() > self.title_diff_ratio:
                mu_id = result[0]
        return mu_id

    def get_chapters(self):
        """ Get all the chapters from the manga main page, then save each one """
        chapters = []
        chapter_containers = self.page.findAll(
            "div", {"class": "row lancamento-linha"})
        for chapter_container in chapter_containers:
            chapter_container = chapter_container.findAll(
                "div", {"class": "col-xs-6 col-md-6"})[0]
            chapters.append(
                chapter_container.findAll("a", {"href": True})[0]['href'])
        chapter = Chapters(self.id, chapters)
        chapter.save()

    def get_covers(self):
        """ Call the covers class and its methods """
        if self.muID is not None:
            covers = Covers(self.id, self.muID)
            covers.get()
            covers.save()

    def save(self):
        """ Save the manga in the database """
        database = Database()
        query = """INSERT INTO manga VALUES (NULL, %s, %s, %s, %s, %s, %s, %s, %s)"""
        database.execute(query, [
            self.muID, self.url, None, self.title, self.description,
            self.status, 0, None
        ])
        self.id = database.last_inserted_id()

    def save_authors(self):
        """ Save the manga authors """
        authors = Authors(self.id, self.authors)
        authors.save()

    def save_artists(self):
        """ Save the manga artists """
        artists = Artists(self.id, self.artists)
        artists.save()

    def save_titles(self):
        """ Save the manga alternative titles """
        titles = Titles(self.id, self.alternative_titles)
        titles.save()

    def save_gender(self):
        """ Save the manga genders """
        genders = Genders(self.id, self.gender_tags)
        genders.save()
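# get_mu_id() picks a candidate whose title is similar enough to the scraped one using
# difflib.SequenceMatcher against the configured diff_ratio threshold. A standalone
# sketch of that matching step, with made-up API results:
from difflib import SequenceMatcher

def match_title(title, results, diff_ratio):
    """ Return the id of the last result whose title similarity exceeds the threshold,
        mirroring the loop in Manga.get_mu_id() """
    mu_id = None
    for result_id, result_title in results:
        if SequenceMatcher(None, title, result_title).ratio() > diff_ratio:
            mu_id = result_id
    return mu_id

print(match_title("One Piece", [(10, "One Piece"), (11, "Two Piece")], 0.9))   # 10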
class Request:
    """ Handles the requests """

    def __init__(self, url):
        self.log = Logging("Requests")
        self.url = url
        self.decode_gzip = lambda response: zlib.decompress(
            response, 16 + zlib.MAX_WBITS)

    def header(self, header_type):
        """ Return the request headers stored in the json file """
        try:
            with open("src/headers/%s.json" % header_type, "r") as header_file:
                return json.loads(header_file.read())
        except (OSError, IOError) as err:
            self.log.error(err)
            return {'': ''}

    def request_page(self):
        """ Make the actual request, return the page content """
        try:
            req = get(self.url, headers=self.header('get'))
            while req.status_code != 200:
                req = get(self.url, headers=self.header('get'))
            if req.encoding == 'gzip':
                # zlib needs bytes, so decompress the raw body instead of the decoded text
                return self.decode_gzip(req.content)
            return req.text
        except exceptions.TooManyRedirects:
            self.log.error("Request got too many redirections on url <%s>" % self.url)
            return "Error"
        except exceptions.Timeout:
            self.log.error("Request timeout on url <%s>" % self.url)
            return "Error"
        except exceptions.RequestException as err:
            self.log.error(err)
            return "Error"

    def get_json(self, send_data=None):
        """ Send a json post and expect a json return """
        if send_data is None:
            send_data = []
        try:
            req = post(self.url, headers=self.header('post'),
                       data=json.dumps(send_data))
            return req.json()
        except Exception as err:
            self.log.error(err)

    def soup(self):
        """ Request wrapper, to guarantee the request is successful """
        request_result = self.request_page()
        while request_result == "Error":
            request_result = self.request_page()
        return BeautifulSoup(request_result, "html5lib")
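# Request.header() loads its HTTP headers from src/headers/get.json and
# src/headers/post.json; those files are not part of this excerpt. A sketch of what
# the GET headers file might contain (the values are placeholders, and the snippet
# assumes the src/headers/ directory exists):
import json

example_get_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)",
    "Accept": "text/html",
}
with open("src/headers/get.json", "w") as header_file:
    json.dump(example_get_headers, header_file)

# Typical usage then mirrors what Manga does:
#   soup = Request("https://example.com/manga/some-title").soup()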
class Database(metaclass=Singleton):
    """ Class responsible for connecting to and operating on the database.
        It uses a singleton to keep a single always-open connection, saving resources """

    def __init__(self):
        conf = Config()
        database_conf = conf.get('database')
        self.host = database_conf['host']
        self.name = database_conf['name']
        self.user = database_conf['user']
        self.passwd = database_conf['passwd']
        self.log = Logging("Database")
        self.conn = self.connect()

    def connect(self):
        """ Responsible for opening the connection to the database and setting the charset """
        try:
            # Newer PyMySQL versions require the connection parameters as keywords
            conn = pymysql.connect(host=self.host, user=self.user,
                                   password=self.passwd, database=self.name)
            cursor = conn.cursor()
            conn.set_charset('utf8')
            cursor.execute('SET NAMES utf8;')
            cursor.execute('SET CHARACTER SET utf8;')
            cursor.execute('SET character_set_connection=utf8;')
            return conn
        except pymysql.MySQLError as err:
            self.log.error(err)
            return False

    def disconnect(self):
        """ Responsible for closing the connection to the database """
        self.conn.close()

    def execute(self, query, params=None):
        """ Responsible for executing queries on the database """
        if params is None:
            params = []
        if not self.conn.open:
            self.conn = self.connect()
        try:
            cursor = self.conn.cursor()
            cursor.execute(query, params)
            self.conn.commit()
            return cursor.fetchall()
        except pymysql.MySQLError as err:
            self.log.error(err)
            return []

    def last_inserted_id(self):
        """ Return the id generated by the last INSERT on this connection """
        try:
            query = """SELECT LAST_INSERT_ID()"""
            return self.execute(query)[0][0]
        except Exception as err:
            self.log.error(err)
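# The Singleton metaclass referenced above is not included in this excerpt. A minimal
# sketch of a typical implementation, assuming the project's version follows the common
# "one cached instance per class" pattern:
class Singleton(type):
    """ Metaclass that returns the same instance on every instantiation """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]

# With this in place, Database() always yields the same connection wrapper:
#   assert Database() is Database()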