Example #1
# -*- coding:utf-8 -*-
from mylogging import MyLogger
from db import DB
from crawler import Crawler

START_PAGE_URL = "http://namu.wiki/w/"
db = DB()

mainLogFile = 'log/main.log'
mainLogger = MyLogger(mainLogFile)

if __name__ == '__main__':
    crawler = Crawler()
    db.makeNamuwikiTable()

    url = START_PAGE_URL
    selectRecentUrl = db.selectRecentUrl()

    mainLogger.info("selectRecentUrl : " + selectRecentUrl)

    if len(selectRecentUrl) > 0:
        url = selectRecentUrl

    while True:
        mainLogger.debug("main url : " + url)
        try:
            crawler.getCrawl(url, 0)
        except Exception as e:
            mainLogger.error("error url : " + url + " : " + str(e))

        url = crawler.getRecentChangeLink()
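
Every example imports MyLogger from mylogging, but that wrapper is never shown. The following is a minimal sketch of what it presumably wraps, assuming a thin layer over the standard logging module with one file handler per log file; the class body and log format are guesses, not taken from the source.

# mylogging.py -- hypothetical sketch of the logger wrapper used throughout
import logging
import os


class MyLogger:
    def __init__(self, logFile, level=logging.DEBUG):
        os.makedirs(os.path.dirname(logFile), exist_ok=True)   # make sure log/ exists
        self._logger = logging.getLogger(logFile)
        self._logger.setLevel(level)
        if not self._logger.handlers:                           # avoid duplicate handlers on re-import
            handler = logging.FileHandler(logFile, encoding='utf-8')
            handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
            self._logger.addHandler(handler)

    def debug(self, msg):
        self._logger.debug(msg)

    def info(self, msg):
        self._logger.info(msg)

    def error(self, msg):
        self._logger.error(msg)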
Example #2
# -*- coding:utf-8 -*-
import config
import pymysql
from mylogging import MyLogger

dbConnectLogFile = 'log/dbConnect.log'
dbConnectLogger = MyLogger(dbConnectLogFile)

class DBConnect(object):
    def __init__(self):
        dbConnectLogger.debug("DBConnect init")
        self._db_connection = pymysql.connect(host=config.DATABASE_CONFIG['host'],
                               user=config.DATABASE_CONFIG['user'],
                               password=config.DATABASE_CONFIG['password'],
                               db=config.DATABASE_CONFIG['dbname'])
        self._db_connection.set_charset('utf8mb4')
        self._db_cur = self._db_connection.cursor()


    def query(self, query, params=None):
        dbConnectLogger.debug("db query execute")
        self._db_cur.execute(query, params)
        return self._db_cur.fetchone()


    def insert(self, query, params=None):
        dbConnectLogger.debug("db insert")
        self._db_cur.execute(query, params)
        return self._db_connection.commit()
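
The config module imported above is not part of these examples. Below is a minimal sketch of the DATABASE_CONFIG dictionary it presumably provides (the key names come from the constructor above; every value is a placeholder), followed by a parameterized usage sketch against the sub table that Example #8 creates.

# config.py -- hypothetical sketch; host/user/password values are placeholders
DATABASE_CONFIG = {
    'host': 'localhost',
    'user': 'crawler',
    'password': 'secret',
    'dbname': 'sub_db',     # Example #8 creates its table inside `sub_db`
}

# Usage sketch: parameters are passed separately so pymysql escapes them.
dbi = DBConnect()
dbi.insert(
    "INSERT INTO `sub_db`.`sub` (smiFileName, engSentence, korSentence, emotion) "
    "VALUES (%s, %s, %s, %s)",
    ("example.smi", "Hello.", "안녕.", "joy"))
row = dbi.query("SELECT engSentence FROM `sub_db`.`sub` WHERE id = %s", (1,))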

Example #3
from mylogging import MyLogger
import db
from SubScraper import gomSubScraper
import SubEditor

subScrapDataLogFile = 'log/subScrapData.log'
subScrapDataLogger = MyLogger(subScrapDataLogFile)

if __name__ == '__main__':
    gomSubScraper = gomSubScraper()
    # gomLastPage = gomSubScraper.getLastPage()
    # for page in range(1, gomLastPage + 1):
    #     smiList = gomSubScraper.getSortSmiList(str(page))
    # SubEditor.sortTxt(smiList[0])

    SubEditor.sortTXT(gomSubScraper.getSortSmiList('0'))
    # print(SubEditor.sortTXT(smi))

    # print('starting Gom Crawl.py...')
    # lastBoardNum = getGomLastBoard(gom_mainBoardPage)
    # # lastBoardNum = 1
    # # print ("lastBoardNum : ", lastBoardNum)
    # pageURLList = []
    # pageURLList = getGomAllBoardPageURL(lastBoardNum)
    # titleList = []
    #
    # getGomTitleLink(pageURLList)
    #
    # while True:
    #     time.sleep(0.5)
    #     db.connect(HOST, PORT)
Example #4
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
from requests.compat import urljoin
from mylogging import MyLogger
import SubEditor

subScrapLogFile = 'log/subScrap.log'
subScrapLogger = MyLogger(subScrapLogFile)

gomBaseUrl = 'http://gom.gomtv.com'


def getHtml(url):
    html = ""
    resp = requests.get(url)
    if resp.status_code == 200:
        html = resp.text
    return html


class gomSubScraper():
    def __init__(self):
        self.lastPage = 1
        self.dbTuple = tuple()
        self.gomKorEnBoardUrl = "http://gom.gomtv.com/main/index.html?ch=subtitles&pt=l&menu=subtitles&lang=3&page="

    def getLastPage(self):
        self.getGomLastBoard()
        return self.lastPage
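
The rest of the scraper (getGomLastBoard, getSortSmiList) is cut off in this example. Purely as a usage illustration of the getHtml helper above, assuming nothing about the real gomtv markup, one board page could be fetched and its links logged like this:

# Illustration only: fetch page 1 of the Kor/Eng subtitle board and log its links.
scraper = gomSubScraper()
html = getHtml(scraper.gomKorEnBoardUrl + '1')
if html:
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a', href=True):       # the real code would filter board rows
        subScrapLogger.debug(urljoin(gomBaseUrl, a['href']))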
Example #5
# -*- coding:utf-8 -*-
import re
import requests
from requests.compat import urljoin
from mylogging import MyLogger
from bs4 import BeautifulSoup
from db import DB
import time
from urllib.parse import quote

crawlLogFile = "log/crawler.log"
crawlLogger = MyLogger(crawlLogFile)

BASE_URL = "https://namu.wiki/w/"
RECENTCHANGE_URL = "https://namu.wiki/sidebar.json"

db = DB()
CRAWL_TERM = 3.0


class Crawler():
    def __init__(self):
        self.url = ""
        self.title = ""
        self.image = ""
        self.editdate = ""
        self.content = ""
        self.html = ""
        self.bsObj = ""
        self.linkList = []
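
The Crawler methods that Example #1 relies on (getCrawl, getRecentChangeLink) are cut off here. Purely as an illustration of the pieces imported above (requests, BeautifulSoup, quote, CRAWL_TERM), and not the author's actual implementation, a single fetch step might look like this:

# Illustrative fetch of one namu.wiki article; not the original getCrawl().
def fetchArticle(title):
    url = BASE_URL + quote(title)                 # percent-encode the Korean title
    crawlLogger.debug("fetch : " + url)
    resp = requests.get(url)
    time.sleep(CRAWL_TERM)                        # stay polite between requests
    if resp.status_code != 200:
        crawlLogger.error("bad status : " + str(resp.status_code))
        return None
    return BeautifulSoup(resp.text, 'html.parser')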
Example #6
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import *
import re
# import konlpy
# import nltk
# from konlpy.tag import Komoran
# from konlpy.tag import Twitter
# from nltk.tokenize import RegexpTokenizer
from mylogging import MyLogger

tokenizer = None
tagger = None

subEditorLogFile = 'log/subEditor.log'
subEditorLogger = MyLogger(subEditorLogFile)

pStyle1 = re.compile('<br>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle2 = re.compile('<font color=.*?>', re.IGNORECASE)
pStyle3 = re.compile('</font>', re.IGNORECASE)
pStyle4 = re.compile('<HEAD(.*?)>(.*?)</HEAD>',
                     re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle5 = re.compile('<!--(.*?)-->', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle6 = re.compile('<br>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle7 = re.compile('<SAMI>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle8 = re.compile('<BODY>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle9 = re.compile('</SAMI>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle10 = re.compile('</BODY>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle11 = re.compile('<i>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle12 = re.compile('</i>', re.IGNORECASE | re.MULTILINE | re.DOTALL)
pStyle13 = re.compile(r'<SYNC Start=\d+><P Class=KR.*>&nbsp;')
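
The pStyle patterns above strip SAMI/HTML markup from subtitle files. A minimal sketch of how they could be applied with re.sub (the helper name stripTags is illustrative, not from the source):

# Hypothetical helper: run every pStyle pattern over a raw SMI string.
def stripTags(smiText):
    patterns = [pStyle1, pStyle2, pStyle3, pStyle4, pStyle5, pStyle6, pStyle7,
                pStyle8, pStyle9, pStyle10, pStyle11, pStyle12, pStyle13]
    for pattern in patterns:
        smiText = pattern.sub('', smiText)        # drop the matched markup
    return smiText.strip()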

Example #7
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

from textblob.classifiers import NaiveBayesClassifier
import pandas as pd

from mylogging import MyLogger


emotionClassifyLogFile = 'log/emotionClassify.log'
emotionClassifyLogger = MyLogger(emotionClassifyLogFile)


dataFrame2 = '''On days when I feel close to my partner and other friends.   
When I feel at peace with myself and also experience a close  
contact with people whom I regard greatly.'''

class emotion_classify:
    def __init__(self):
        emotionClassifyLogger.debug("emo_classify init")

        self.df = pd.read_csv('ISEAR.csv')
        self.a = pd.Series(self.df['joy'])
        self.b = pd.Series(self.df[dataFrame2])
        self.new_df = pd.DataFrame({'Text': self.b, 'Emotion': self.a})

        self.stop = set(stopwords.words('english'))
        self.exclude = set(string.punctuation)
        self.lemma = WordNetLemmatizer()
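
    # Example #9 calls ec.classify_text(...), but that method is cut off here.
    # The methods below are a hedged sketch of what it might do with the pieces
    # set up in __init__: clean text with the stopword/punctuation sets and the
    # lemmatizer, then ask a NaiveBayesClassifier assumed to be trained on new_df.
    # None of this is confirmed by the source.
    def clean(self, text):
        words = [w for w in text.lower().split() if w not in self.stop]
        stripped = ''.join(ch for ch in ' '.join(words) if ch not in self.exclude)
        return ' '.join(self.lemma.lemmatize(w) for w in stripped.split())

    def train(self):
        data = [(self.clean(t), e)
                for t, e in zip(self.new_df['Text'], self.new_df['Emotion'])
                if isinstance(t, str) and isinstance(e, str)]
        self.classifier = NaiveBayesClassifier(data)

    def classify_text(self, text):
        # Assumes train() has run (the original presumably trains in __init__).
        return self.classifier.classify(self.clean(text))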
Example #8
# -*- coding:utf-8 -*-
from mylogging import MyLogger
import dbConnect
import pymysql

dbLogFile = 'log/db.log'
dbLogger = MyLogger(dbLogFile)

dbi = dbConnect.DBConnect()


class DB():
    def makeSubTable(self):
        makeTableQuery = """
        CREATE TABLE IF NOT EXISTS `sub_db`.`sub` (
            `id` INT NOT NULL AUTO_INCREMENT,
            `smiFileName` VARCHAR(200) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL,
            `engSentence` VARCHAR(500) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL,
            `korSentence` VARCHAR(500) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NOT NULL,
            `emotion` VARCHAR(45) CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci' NULL DEFAULT NULL,
            PRIMARY KEY (`id`))
            ENGINE = InnoDB
            DEFAULT CHARACTER SET = utf8mb4
            COLLATE = utf8mb4_unicode_ci;
        """

        dbLogger.debug("makeSubDB")
        try:
            return dbi.query(makeTableQuery)
        except Exception as e:
            return dbLogger.error(e)
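
    # Example #9 also calls db.insertSubDB(...), which is not shown in these
    # examples. The method below is a hedged sketch: only the name and the table
    # schema come from the source; the body is an assumption that it binds the
    # (smiFileName, engSentence, korSentence, emotion) tuple to a parameterized
    # INSERT through dbi.insert.
    def insertSubDB(self, params):
        insertQuery = """
        INSERT INTO `sub_db`.`sub` (`smiFileName`, `engSentence`, `korSentence`, `emotion`)
        VALUES (%s, %s, %s, %s);
        """
        dbLogger.debug("insertSubDB")
        try:
            return dbi.insert(insertQuery, params)
        except Exception as e:
            dbLogger.error(e)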
Example #9
from mylogging import MyLogger
from db import DB
from SubScraper import gomSubScraper
from EmotionClassifier import emotion_classify

subScrapDataLogFile = 'log/subScrapData.log'
subScrapDataLogger = MyLogger(subScrapDataLogFile)

if __name__ == '__main__':
    gomSubScraper = gomSubScraper()
    db = DB()
    ec = emotion_classify()
    db.makeSubTable()

    gomLastPage = gomSubScraper.getLastPage()
    for page in range(1, gomLastPage + 1):
        sortSmiDictList = gomSubScraper.getSortSmiList(str(page))

        for sortSmiDict in sortSmiDictList:
            smiFileName = list(sortSmiDict.keys())[0]
            smiList = list(sortSmiDict.values())[0]
            for enSmi, korSmi in smiList:
                emotion = ec.classify_text(enSmi)
                subScrapDataLogger.info(smiFileName + enSmi + korSmi + emotion)
                db.insertSubDB((smiFileName, enSmi, korSmi, emotion))
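
Note: the four-element tuple passed to insertSubDB follows the column order of the sub table from Example #8 (smiFileName, engSentence, korSentence, emotion), and classify_text is presumably expected to return the emotion label as a plain string, since it is concatenated directly into the info log line above.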