# -*- coding: utf-8 -*-
## 7000 word word2Vec example
from dataHandler.MysqlHandler import DataHandler
import gensim
import os

# Credentials come from the environment; a missing variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

termList = dataHandler.getTokenTitle(7000)

# Group field [2] of every returned row under that row's field [1].
# NOTE(review): rows are presumably (id, rss_id, term) tuples - confirm
# against DataHandler.getTokenTitle.
document = {}
for row in termList:
    document.setdefault(row[1], []).append(row[2])

# Parallel lists in the dict's iteration order: texts[i] belongs to ids[i].
texts = list(document.values())  # document text
ids = list(document.keys())  # document ids
from dataHandler.MysqlHandler import DataHandler
from dataHandler.DataCollector import DataCollector
import os

# Credentials come from the environment; a missing variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")


def rssFetch(url_id, rssData):
    """Store one RSS item unless its link is already known.

    Looks the item up with ``dataHandler.getRssdataByLink`` and, only when
    that lookup comes back empty, inserts it with
    ``dataHandler.insertRssData``.

    Args:
        url_id: Passed through as the last argument of ``insertRssData``.
        rssData: Mapping providing the keys "title", "link", "pubDate"
            and "description".
    """
    existing = dataHandler.getRssdataByLink(rssData["link"])
    if len(existing) != 0:
        # The link is already stored; nothing to insert.
        return
    dataHandler.insertRssData(
        '',
        rssData["title"],
        rssData["link"],
        rssData["pubDate"],
        rssData["description"],
        url_id,
    )


dataCollector = DataCollector()
urlList = dataHandler.getUrl()

# Hand every row to the collector, with rssFetch as the third argument.
# NOTE(review): row[2] is presumably the feed URL and row[0] its id -
# confirm against DataHandler.getUrl / DataCollector.getRss.
for url in urlList:
    dataCollector.getRss(url[2], url[0], rssFetch)
# RSS row filtering script. All statements are collapsed onto the single line
# below, and the final loop is cut off in the visible source, so the code is
# left untouched. In order, the line:
#   1. imports Morpheme, DataHandler and os;
#   2. reads DB_USER / DB_PW from os.environ (KeyError if unset) and builds
#      DataHandler("localhost", dbUser, dbPw, "kisa");
#   3. creates a Morpheme() instance (its use is not visible here) and loads
#      rssDataList = dataHandler.getRssdata("0");
#   4. defines isNumber(s): returns True when float(s) succeeds and False when
#      it raises ValueError; any other exception propagates;
#   5. defines filterToken(var): returns True only when len(var) > 2 and
#      isNumber(var) is False, otherwise False;
#   6. starts looping over rssDataList and tests that rssData[1] is not None,
#      is not "" and has a length greater than 2. The body of that `if` lies
#      beyond the visible source.
# NOTE(review): rssData[1] is presumably the item's title text - confirm
# against DataHandler.getRssdata.
from analysis.Morpheme import Morpheme from dataHandler.MysqlHandler import DataHandler import os dbUser = os.environ["DB_USER"] dbPw = os.environ["DB_PW"] dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa") morpheme = Morpheme() rssDataList = dataHandler.getRssdata("0") def isNumber(s): try: float(s) return True except ValueError: return False def filterToken(var): if len(var) > 2 and isNumber(var) == False: return True return False for rssData in rssDataList: if (rssData[1] != None and rssData[1] != "" and len(rssData[1]) > 2):
# LSA related-document example. All statements are collapsed onto the single
# line below, and the final `if` is cut off in the visible source, so the code
# is left untouched. In order, the line:
#   1. imports DataHandler, VectorSpace and os;
#   2. reads DB_USER / DB_PW from os.environ (KeyError if unset) and builds
#      DataHandler("localhost", dbUser, dbPw, "kisa");
#   3. loads termList = dataHandler.getTokenTitle(3000);
#   4. creates the empty dicts `document` and `index_to_keyword_mapping`;
#   5. loops over termList, appending each whole row to document[term[1]]
#      (creating the list the first time a key is seen) and calling
#      dataHandler.getKeywordIndex(term[2]);
#   6. ends on `if len(keywordIndex) > 0:`, whose body lies beyond the
#      visible source.
# NOTE(review): the collapsed text does not show whether the getKeywordIndex
# lookup runs for every row or only inside the `else` branch - confirm
# against the original file.
## 3000 word related document example by LSA from dataHandler.MysqlHandler import DataHandler from analysis.lsa.vector_space import VectorSpace import os dbUser = os.environ["DB_USER"] dbPw = os.environ["DB_PW"] dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa") termList = dataHandler.getTokenTitle(3000) ## document format : {rss_id : [(id, rss_id, term), (id, rss_id, term)]} ## ex) {11882: [(1, 11882, '6자수석'), (2, 11882, '평화협정')], 11883: [(3, 11883, '문체부')]} document = {} index_to_keyword_mapping = {} for term in termList : if term[1] in document : document[term[1]].append(term) else: document[term[1]] = [] document[term[1]].append(term) keywordIndex = dataHandler.getKeywordIndex(term[2]) if len(keywordIndex) > 0:
# -*- coding: utf-8 -*-
## 7000 word related topic example by LDA
from dataHandler.MysqlHandler import DataHandler
from analysis.util.DocumentUtil import DocumentUtil
from gensim import corpora, models
import gensim
import os

# Credentials come from the environment; a missing variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

termList = dataHandler.getTokenTitle(7000)

# Collect field [2] of each row into a list keyed by the row's field [1].
# NOTE(review): rows are presumably (id, rss_id, term) tuples - confirm
# against DataHandler.getTokenTitle.
document = {}
for entry in termList:
    doc_key = entry[1]
    if doc_key not in document:
        document[doc_key] = []
    document[doc_key].append(entry[2])

# ids and texts are index-aligned: texts[i] is the list stored for ids[i].
ids = list(document)  # document ids
texts = [document[doc_key] for doc_key in ids]  # document text
# -*- coding: utf-8 -*-
## 7000 word related topic example by LDA
from dataHandler.MysqlHandler import DataHandler
from analysis.util.DocumentUtil import DocumentUtil
from gensim import corpora, models
import gensim
import os

# Credentials come from the environment; a missing variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

termList = dataHandler.getTokenTitle(7000)

# Bucket field [2] of every row by the row's field [1]; a bucket is created
# the first time its key is seen.
document = {}
for record in termList:
    document.setdefault(record[1], []).append(record[2])

# Two parallel lists, both in the dict's iteration order.
texts = list(document.values())  # document text
ids = list(document.keys())  # document ids
# RSS row filtering script. All statements are collapsed onto the single line
# below, and the final loop is cut off in the visible source, so the code is
# left untouched. In order, the line:
#   1. imports Morpheme, DataHandler and os;
#   2. reads DB_USER / DB_PW from os.environ (KeyError if unset) and builds
#      DataHandler("localhost", dbUser, dbPw, "kisa");
#   3. creates a Morpheme() instance (its use is not visible here) and loads
#      rssDataList = dataHandler.getRssdata("0");
#   4. defines isNumber(s): returns True when float(s) succeeds and False when
#      it raises ValueError; any other exception propagates;
#   5. defines filterToken(var): returns True only when len(var) > 2 and
#      isNumber(var) is False, otherwise False;
#   6. opens a loop over rssDataList whose body lies beyond the visible
#      source.
from analysis.Morpheme import Morpheme from dataHandler.MysqlHandler import DataHandler import os dbUser = os.environ["DB_USER"] dbPw = os.environ["DB_PW"] dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa"); morpheme = Morpheme() rssDataList = dataHandler.getRssdata("0") def isNumber(s): try: float(s) return True except ValueError: return False def filterToken(var): if len(var) > 2 and isNumber(var) == False : return True return False for rssData in rssDataList :
# -*- coding: utf-8 -*-
## 7000 word word2Vec example
from dataHandler.MysqlHandler import DataHandler
import gensim
import os

# Credentials come from the environment; a missing variable raises KeyError.
dbUser = os.environ["DB_USER"]
dbPw = os.environ["DB_PW"]
dataHandler = DataHandler("localhost", dbUser, dbPw, "kisa")

termList = dataHandler.getTokenTitle(7000)

# Build {row[1]: [row[2], ...]}, keeping rows in the order they were returned.
document = {}
for record in termList:
    group_key = record[1]
    if group_key not in document:
        document[group_key] = []
    document[group_key].append(record[2])

# ids and texts are index-aligned: texts[i] is the list stored for ids[i].
ids = list(document)  # document ids
texts = [document[group_key] for group_key in ids]  # document text