예제 #1
0
# -*- coding: utf-8 -*-
## 7000 word word2Vec example

from dataHandler.MysqlHandler import DataHandler
import gensim

import os
# Database credentials come from the environment; a missing variable
# raises KeyError right here.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

# Group field 2 of every row under the row's field 1, keeping row order.
document = {}
termList = dataHandler.getTokenTitle(7000)
for term in termList:
    document.setdefault(term[1], []).append(term[2])

# Parallel lists: texts[i] holds the values grouped under key ids[i].
ids = []    # document ids
texts = []  # document text
for key, item in document.items():
    ids.append(key)
    texts.append(item)


예제 #2
0
from dataHandler.MysqlHandler import DataHandler
from dataHandler.DataCollector import DataCollector

import os
# Database credentials are read from the environment; a missing variable
# raises KeyError here.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

# Module-level handler used by rssFetch and by the fetch loop further down.
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")


def rssFetch(url_id, rssData):
    """Insert one RSS item unless a lookup by its link already finds rows.

    The item is looked up with ``dataHandler.getRssdataByLink`` using
    ``rssData["link"]``; only when that lookup is empty is the item
    written with ``dataHandler.insertRssData``.

    Args:
        url_id: Forwarded unchanged as the last argument of
            ``insertRssData``.
        rssData: Mapping that provides the keys "title", "link",
            "pubDate" and "description".
    """
    existing = dataHandler.getRssdataByLink(rssData["link"])
    if len(existing) != 0:
        # Link already recorded: nothing to insert.
        return
    dataHandler.insertRssData(
        '',
        rssData["title"],
        rssData["link"],
        rssData["pubDate"],
        rssData["description"],
        url_id,
    )


# Walk every row returned by dataHandler.getUrl() and hand it to the
# collector together with rssFetch.
dataCollector = DataCollector()
urlList = dataHandler.getUrl()

for url in urlList:
    # url[2] and url[0] are positional fields of the row -- presumably the
    # feed address and its id (rssFetch names its first parameter url_id);
    # confirm against DataHandler.getUrl. rssFetch is passed as the third
    # argument, presumably to be called per fetched item.
    dataCollector.getRss(url[2], url[0], rssFetch)
예제 #3
0
from analysis.Morpheme import Morpheme
from dataHandler.MysqlHandler import DataHandler

import os
# Database credentials are read from the environment; a missing variable
# raises KeyError here.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

morpheme = Morpheme()

# Rows to process, iterated by the loop further down. The meaning of the
# "0" argument is not visible here -- check DataHandler.getRssdata.
rssDataList = dataHandler.getRssdata("0")


def isNumber(s):
    """Return True when ``float(s)`` succeeds, otherwise False.

    Anything ``float()`` accepts counts as a number, so besides plain
    digits this includes strings such as "1e3", "inf" and "nan".

    Args:
        s: Value to test; normally a token string.

    Returns:
        bool: True if ``s`` converts to a float, False if it does not.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not numeric.
        # TypeError: not a string or number at all (None, list, ...);
        # a predicate should answer False instead of raising.
        return False
    return True


def filterToken(var):
    """Tell whether a token should be kept.

    A token is kept when it is longer than two characters and
    ``isNumber`` reports it as non-numeric. The length test runs first,
    so shorter tokens never reach ``isNumber``.

    Args:
        var: Token to test; must support ``len()``.

    Returns:
        bool: True to keep the token, False to drop it.
    """
    return len(var) > 2 and not isNumber(var)


for rssData in rssDataList:

    if (rssData[1] != None and rssData[1] != "" and len(rssData[1]) > 2):
예제 #4
0
## 3000 word related document example by LSA

from dataHandler.MysqlHandler import DataHandler
from analysis.lsa.vector_space import VectorSpace

import os
# Database credentials are read from the environment; a missing variable
# raises KeyError here.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")


# Rows consumed by the loop below; per the format note they look like
# (id, rss_id, term) tuples -- confirm against DataHandler.getTokenTitle.
termList = dataHandler.getTokenTitle(3000)


## document format : {rss_id: [(id, rss_id, term), (id, rss_id, term)]}
## ex) {11882: [(1, 11882, '6자수석'), (2, 11882, '평화협정')], 11883: [(3, 11883, '문체부')]}
document = {}

# Starts empty; how it is filled is not visible in this excerpt.
index_to_keyword_mapping = {}

for term in termList :

    if term[1] in document :
        document[term[1]].append(term)
    else:
        document[term[1]] = []
        document[term[1]].append(term)

    keywordIndex = dataHandler.getKeywordIndex(term[2])
    if len(keywordIndex) > 0:
예제 #5
0
# -*- coding: utf-8 -*-

## 7000 word related topic example by LDA

from dataHandler.MysqlHandler import DataHandler
from analysis.util.DocumentUtil import DocumentUtil
from gensim import corpora, models
import gensim

import os
# Credentials are supplied through the environment (KeyError when unset).
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

# Collect field 2 of each row into a list keyed by the row's field 1.
document = {}
termList = dataHandler.getTokenTitle(7000)
for term in termList:
    document.setdefault(term[1], []).append(term[2])

# Flatten the mapping into two index-aligned lists.
ids = []    # document ids
texts = []  # document text
for key, item in document.items():
    ids.append(key)
    texts.append(item)
예제 #6
0
파일: LDAProc.py 프로젝트: agune/kisa
# -*- coding: utf-8 -*-

## 7000 word related topic example by LDA

from dataHandler.MysqlHandler import DataHandler
from analysis.util.DocumentUtil import DocumentUtil
from gensim import corpora, models
import gensim

import os
# Database user and password are taken from the environment; an unset
# variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

# Bucket each row's field 2 under its field 1, preserving row order.
document = {}
termList = dataHandler.getTokenTitle(7000)
for term in termList:
    document.setdefault(term[1], []).append(term[2])

# ids[i] is the key whose grouped values are stored in texts[i].
ids = []    # document ids
texts = []  # document text
for key, item in document.items():
    ids.append(key)
    texts.append(item)
예제 #7
0
파일: MorphemeProc.py 프로젝트: agune/kisa
from analysis.Morpheme import Morpheme
from dataHandler.MysqlHandler import DataHandler


import os
# Database credentials are read from the environment; a missing variable
# raises KeyError here.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

morpheme = Morpheme()

# Rows iterated by the loop further down.
rssDataList = dataHandler.getRssdata("0")


def isNumber(s):
    """Report whether ``s`` can be converted with ``float()``.

    Every input ``float()`` accepts is treated as numeric, which also
    covers forms like "1e3", "inf" and "nan".

    Args:
        s: Value to test; normally a token string.

    Returns:
        bool: True when the conversion succeeds, False otherwise.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError covers non-numeric strings; TypeError covers inputs
        # float() cannot take at all (None, list, ...), which previously
        # escaped the predicate as an exception.
        return False
    return True

def filterToken(var):
    """Decide whether a token passes the filter.

    Passing requires more than two characters and a non-numeric verdict
    from ``isNumber``; the length check short-circuits, so short tokens
    are rejected without calling ``isNumber``.

    Args:
        var: Token to test; must support ``len()``.

    Returns:
        bool: True when the token passes, False otherwise.
    """
    return len(var) > 2 and not isNumber(var)


for rssData in rssDataList :
예제 #8
0
파일: Word2Vec.py 프로젝트: agune/kisa
# -*- coding: utf-8 -*-
## 7000 word word2Vec example

from dataHandler.MysqlHandler import DataHandler
import gensim

import os

# Credentials are read from the environment; an unset variable raises
# KeyError at this point.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]

dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

# Build {row[1]: [row[2], ...]} from the fetched rows, in row order.
document = {}
termList = dataHandler.getTokenTitle(7000)
for term in termList:
    document.setdefault(term[1], []).append(term[2])

# Split the mapping into aligned lists of keys and grouped values.
ids = []    # document ids
texts = []  # document text
for key, item in document.items():
    ids.append(key)
    texts.append(item)