# -*- coding:utf-8 -*-
from mylogging import MyLogger
from db import DB
from crawler import Crawler

# Seed page for the crawl.
# NOTE(review): crawler.py builds links from an https BASE_URL while this seed
# is http — confirm the scheme mismatch is intentional.
START_PAGE_URL = "http://namu.wiki/w/"

db = DB()

mainLogFile = 'log/main.log'
mainLogger = MyLogger(mainLogFile)

if __name__ == '__main__':
    # Entry point: resume from the most recently crawled URL if one is
    # recorded, otherwise start from the seed page, then crawl forever.
    crawler = Crawler()
    db.makeNamuwikiTable()

    url = START_PAGE_URL
    selectRecentUrl = db.selectRecentUrl()
    mainLogger.info("selectRecentUrl : " + selectRecentUrl)
    if len(selectRecentUrl) > 0:
        url = selectRecentUrl

    while True:
        mainLogger.debug("main url : " + url)
        try:
            crawler.getCrawl(url, 0)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the loop unstoppable.
            # Failures on a single page are logged and crawling continues.
            mainLogger.error("error url : " + url)
        url = crawler.getRecentChangeLink()
# -*- coding:utf-8 -*-
import config
import pymysql
from mylogging import MyLogger

dbConnectLogFile = 'log/dbConnect.log'
dbConnectLogger = MyLogger(dbConnectLogFile)


class DBConnect(object):
    """Thin wrapper around a single pymysql connection and cursor.

    Connection parameters come from config.DATABASE_CONFIG; the connection
    charset is forced to utf8mb4 so Korean text and emoji round-trip.
    """

    def __init__(self):
        dbConnectLogger.debug("DBConnect init")
        self._db_connection = pymysql.connect(
            host=config.DATABASE_CONFIG['host'],
            user=config.DATABASE_CONFIG['user'],
            password=config.DATABASE_CONFIG['password'],
            db=config.DATABASE_CONFIG['dbname'])
        self._db_connection.set_charset('utf8mb4')
        self._db_cur = self._db_connection.cursor()

    def query(self, query, params=None):
        """Execute a read-style statement and return the first row (or None).

        BUGFIX: `params` was accepted but silently dropped, so parameterized
        queries never worked and callers were pushed toward string-built SQL.
        They are now forwarded to the driver, matching insert().
        """
        dbConnectLogger.debug("db query execute")
        self._db_cur.execute(query, params)
        return self._db_cur.fetchone()

    def insert(self, query, params=None):
        """Execute a write statement with `params` and commit the transaction."""
        dbConnectLogger.debug("db insert")
        self._db_cur.execute(query, params)
        return self._db_connection.commit()
from mylogging import MyLogger
import db
from SubScraper import gomSubScraper
import SubEditor

subScrapDataLogFile = 'log/subScrapData.log'
subScrapDataLogger = MyLogger(subScrapDataLogFile)

if __name__ == '__main__':
    # BUGFIX: the instance was previously bound to the name `gomSubScraper`,
    # shadowing the imported class and making a second instantiation (or any
    # later reference to the class) impossible.
    scraper = gomSubScraper()
    # Processes only board page '0' here; iterate 1..getLastPage() to cover
    # the whole board (see the paging loop in the full pipeline script).
    SubEditor.sortTXT(scraper.getSortSmiList('0'))
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
from requests.compat import urljoin
from mylogging import MyLogger
import SubEditor

subScrapLogFile = 'log/subScrap.log'
subScrapLogger = MyLogger(subScrapLogFile)

gomBaseUrl = 'http://gom.gomtv.com'


def getHtml(url, timeout=None):
    """Fetch `url` and return the response body, or "" on any non-200 status.

    Args:
        url: absolute URL to fetch.
        timeout: optional per-request timeout in seconds. Defaults to None,
            which preserves the previous behavior (requests waits
            indefinitely); pass a number to avoid a hung crawl.

    Returns:
        The decoded response text, or the empty string when the server does
        not answer 200 (best-effort by design — callers treat "" as "skip").
    """
    html = ""
    resp = requests.get(url, timeout=timeout)
    if resp.status_code == 200:
        html = resp.text
    return html


class gomSubScraper():
    """Scraper for Kor/Eng subtitle (.smi) postings on gom.gomtv.com."""

    def __init__(self):
        # Cached last board page number; refreshed by getGomLastBoard().
        self.lastPage = 1
        self.dbTuple = tuple()
        # Kor/Eng subtitle board listing; append a page number to paginate.
        self.gomKorEnBoardUrl = "http://gom.gomtv.com/main/index.html?ch=subtitles&pt=l&menu=subtitles&lang=3&page="

    def getLastPage(self):
        """Return the board's last page number.

        Delegates to getGomLastBoard() (defined further down this file) to
        refresh self.lastPage before returning it.
        """
        self.getGomLastBoard()
        return self.lastPage
# -*- coding:utf-8 -*-
import re
import requests
from requests.compat import urljoin
from mylogging import MyLogger
from bs4 import BeautifulSoup
from db import DB
import time
from urllib.parse import quote

crawlLogFile = "log/crawler.log"
crawlLogger = MyLogger(crawlLogFile)

BASE_URL = "https://namu.wiki/w/"
RECENTCHANGE_URL = "https://namu.wiki/sidebar.json"

db = DB()

CRAWL_TERM = 3.0


class Crawler():
    """Namu-wiki page crawler.

    An instance carries the scrape state of the page currently being
    processed; every field is reset/overwritten per crawled page.
    """

    def __init__(self):
        # Per-page scrape fields all begin as empty strings (including the
        # parse artifacts html/bsObj, which are replaced once a page loads).
        self.url = self.title = self.image = ""
        self.editdate = self.content = ""
        self.html = self.bsObj = ""
        # Outgoing wiki links harvested from the current page.
        self.linkList = []
# -*- coding:utf-8 -*- from bs4 import BeautifulSoup from urllib import * import re # import konlpy # import nltk # from konlpy.tag import Komoran # from konlpy.tag import Twitter # from nltk.tokenize import RegexpTokenizer from mylogging import MyLogger tokenizer = None tagger = None subEditorLogFile = 'log/subEditor.log' subEditorLogger = MyLogger(subEditorLogFile) pStyle1 = re.compile('<br>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle2 = re.compile('<font color=.*?>', re.IGNORECASE) pStyle3 = re.compile('</font>', re.IGNORECASE) pStyle4 = re.compile('<HEAD(.*?)>(.*?)</HEAD>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle5 = re.compile('<!--(.*?)-->', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle6 = re.compile('<br>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle7 = re.compile('<SAMI>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle8 = re.compile('<BODY>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle9 = re.compile('</SAMI>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle10 = re.compile('</BODY>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle11 = re.compile('<i>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle12 = re.compile('</i>', re.IGNORECASE | re.MULTILINE | re.DOTALL) pStyle13 = re.compile(r'<SYNC Start=\d+><P Class=KR.*> ')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from textblob.classifiers import NaiveBayesClassifier
import pandas as pd
from mylogging import MyLogger

emotionClassifyLogFile = 'log/emotionClassify.log'
emotionClassifyLogger = MyLogger(emotionClassifyLogFile)

# NOTE(review): this literal doubles as a COLUMN KEY below — the ISEAR.csv
# header row apparently contains this sentence verbatim, making it the name
# of the text column. Verify against the actual file before touching it.
dataFrame2 = '''On days when I feel close to my partner and other friends. When I feel at peace with myself and also experience a close contact with people whom I regard greatly.'''


class emotion_classify:
    """Emotion classifier over the ISEAR dataset (setup portion).

    Loads the training CSV and builds preprocessing resources; constructing
    an instance therefore performs file I/O (reads 'ISEAR.csv' from the
    working directory).
    """

    def __init__(self):
        emotionClassifyLogger.debug("emo_classify init")
        # Training data; requires ISEAR.csv next to the process's cwd.
        self.df = pd.read_csv('ISEAR.csv')
        # 'joy' and the long sentence above are the CSV's header cells, so
        # these select the label column and the text column respectively —
        # TODO confirm against ISEAR.csv's actual header.
        self.a = pd.Series(self.df['joy'])
        self.b = pd.Series(self.df[dataFrame2])
        # Normalized frame with conventional column names for training.
        self.new_df = pd.DataFrame({'Text': self.b, 'Emotion': self.a})
        # Text-cleaning resources: English stopwords, punctuation set,
        # and a WordNet lemmatizer (presumably used by methods below).
        self.stop = set(stopwords.words('english'))
        self.exclude = set(string.punctuation)
        self.lemma = WordNetLemmatizer()
# -*- coding:utf-8 -*- from mylogging import MyLogger import dbConnect import pymysql dbLogFile = 'log/db.log' dbLogger = MyLogger(dbLogFile) dbi = dbConnect.DBConnect() class DB(): def makeSubTable(self): makeTableQuery = """ CREATE TABLE IF NOT EXISTS `sub_db`.`sub` ( `id` INT NOT NULL AUTO_INCREMENT, `smiFileName` VARCHAR(200) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL, `engSentence` VARCHAR(500) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL, `korSentence` VARCHAR(500) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL, `emotion` VARCHAR(45) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NULL DEFAULT NULL, PRIMARY KEY (`id`)) ENGINE = InnoDB DEFAULT CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci; """ dbLogger.debug("makeSubDB") try: return dbi.query(makeTableQuery) except Exception as e: return dbLogger.error(e)
from mylogging import MyLogger
from db import DB
from SubScraper import gomSubScraper
from EmotionClassifier import emotion_classify

subScrapDataLogFile = 'log/subScrapData.log'
subScrapDataLogger = MyLogger(subScrapDataLogFile)

if __name__ == '__main__':
    # Full pipeline: scrape every board page, classify each English line's
    # emotion, and persist (file, eng, kor, emotion) rows.
    # BUGFIX: the instance was bound to the name `gomSubScraper`, shadowing
    # the imported class.
    scraper = gomSubScraper()
    db = DB()
    ec = emotion_classify()
    db.makeSubTable()

    gomLastPage = scraper.getLastPage()
    for page in range(1, gomLastPage + 1):
        # Each dict maps one .smi file name -> list of (english, korean) pairs.
        sortSmiDictList = scraper.getSortSmiList(str(page))
        for sortSmiDict in sortSmiDictList:
            # Equivalent to list(d.keys())[0]/list(d.values())[0] but without
            # materializing two throwaway lists.
            # NOTE(review): only the FIRST entry of each dict is processed —
            # confirm these dicts are always single-entry.
            smiFileName, smiList = next(iter(sortSmiDict.items()))
            for enSmi, korSmi in smiList:
                emotion = ec.classify_text(enSmi)
                subScrapDataLogger.info(smiFileName + enSmi + korSmi + emotion)
                db.insertSubDB((smiFileName, enSmi, korSmi, emotion))