class ContentExtractor(object): ''' classdocs ''' ''' steps: get serialized json file ''' @profile def __init__(self, domain, htmlFileURL, pwd, CoreNLPner='', spacyNER='', dbConnection=''): ''' Constructor ''' self.__fileURL = htmlFileURL self.__domainDBkey = domain try: self.__XpathList = pickle.load( open( pwd + '/xpathModels/' + str(self.__domainDBkey) + '.pickle', 'rb')) #=================================================================== # self.__XpathListID = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_ID.pickle', 'rb')) # self.__XpathListNoAttrib = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_NoAttrib.pickle', 'rb')) #=================================================================== except PicklingError, e: print e self.__htmlElements = [ 'body', 'header', 'nav', 'footer', 'article', 'section', 'aside', 'div', 'span' ] self.__htmlAttributes = ['id', 'class'] self.__documentIDKey = '' self.__utilitiesFunctions = utilities() #DB CONNECTIVITY AND FUNCTIONALITY self.__db = connectMySQL(db='xpath', port=3366) self.__topicModel = techDashTopicModel(destination=pwd + '/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T') #======================================================================= # OPEN URL #======================================================================= url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR( self.__fileURL) #======================================================================= # NER #======================================================================= self.__extractNerStanford = CoreNLPner self.__extractNerSpacy = spacyNER
# Maintenance script: re-assign LDA topics to every stored
# xpath-extracted article in the `xpath` database.
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel
from gensim.models import LdaModel

# NOTE(review): hard-coded local filesystem paths and DB port -- presumably a
# one-off maintenance script bound to the author's machine; confirm before
# running elsewhere.
db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
# NOTE(review): `model` is loaded but never used below -- techDashTopicModel
# appears to load its own copy; kept as-is since other code may import it.
model = LdaModel.load(modelDestination + modelName + '.lda', mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================
sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

# NOTE(review): this reaches into the name-mangled private attribute
# `_connectMySQL__results`; a public accessor on connectMySQL would be safer.
for item in db._connectMySQL__results:
    #===========================================================================
    # print item
    #===========================================================================
    # item is a result row: (xpathValuesID, xpathValuesContent).
    # Classify the stored article content against the current topic model.
    topicModelCat = topicModel.getDocumentTopics(item[1])
def __init__(self, pwd, feedURL=''):
    """Initialise the RSS feed reader.

    :param pwd: project root; xpath models live under pwd + '/xpathModels/'
    :param feedURL: a single feed URL (str), a list of feed URLs, or ''
        (default) to use the built-in list of technology news feeds
    """
    # Candidate feeds not yet included:
    #   https://news.ycombinator.com/rss
    #   http://skimfeed.com/tech.html
    #   https://gcn.com/rss-feeds/all.aspx
    if feedURL == '':
        # FIX: the default list previously contained the news.com.au feed
        # twice, which caused it to be fetched twice per run; deduplicated.
        self.__feedURL = [
            'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
            'http://www.cnet.com/rss/all/',
            'http://www.wired.com/category/gear/feed/',
            'http://www.wired.com/category/science/feed/',
            'http://www.infoworld.com/index.rss',
            'http://www.pcworld.com/index.rss',
            'http://www.computerworld.com/index.rss',
            'http://www.networkcomputing.com/rss_simple.asp',
            'http://www.engadget.com/rss-full.xml',
            'http://www.digitaltrends.com/feed/',
            'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
            'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
            'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
            'http://feeds.reuters.com/reuters/technologyNews?format=xml',
            'http://feeds.reuters.com/reuters/scienceNews',
            'http://feeds.bbci.co.uk/news/technology/rss.xml',
            'http://feeds.feedburner.com/Technibble',
            'http://feeds.feedburner.com/TechCrunch/',
            'http://feeds.feedburner.com/techradar/allnews',
            'http://feeds.arstechnica.com/arstechnica/index?format=xml',
            'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
        ]
    else:
        self.__feedURL = feedURL

    # One conditional-GET state record (etag / last-modified) per feed URL.
    if isinstance(self.__feedURL, list):
        self.__etags = {
            item: {'etag': None, 'modified': None, 'feed': None, 'changed': False}
            for item in self.__feedURL
        }
    else:
        self.__etags = {
            self.__feedURL: {
                'etag': None,
                'modified': None,
                'feed': None,
                'changed': False
            }
        }

    self.__articleLinks = []
    self.__domainDBkey = None
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__pwd = pwd
    self.__filesFolder = pwd + '/xpathModels/'
    self.__utilitiesFunctions = utilities()

    # spaCy English pipeline for NER; data directory can be overridden
    # through the SPACY_DATA environment variable.
    self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
def __init__(self, domain, htmlFileURL, pwd, dbConnection='', path=''):
    """Set up the per-domain xpath trainer state.

    :param domain: domain name; keys the on-disk resources under
        pwd + '/xpathModels/'
    :param htmlFileURL: URL of the HTML page to train on
    :param pwd: project root directory
    :param dbConnection: unused in this constructor
    :param path: unused in this constructor
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Initial variables.
    self.__domain = str(domain)
    self.__htmlFileURL = htmlFileURL
    self.__xpathPaths = []

    # Directory structure: all per-domain resource files share this prefix.
    self.__dictionaryPath = pwd + '/xpathModels/'
    self.__domainResourcesFile = self.__dictionaryPath + self.__domain

    # Structure and content elements used for xpath creation.
    self.__htmlElements = ['body', 'header', 'nav', 'footer', 'article',
                           'section', 'aside', 'div', 'span']
    self.__htmlAttributes = ['id', 'class']
    self.__htmlElementsSkip = ['script', 'style']

    # Load background knowledge; fall back to an empty dict when the pickle
    # is missing or unreadable.
    try:
        # FIX: context manager closes the file handle; the bare `except:`
        # previously swallowed even KeyboardInterrupt/SystemExit.
        with open(self.__domainResourcesFile + '_bckKnowledge.pickle', 'rb') as bckFile:
            self.__htmlFileBackgroundKnowledge = pickle.load(bckFile)
    except Exception:
        self.__htmlFileBackgroundKnowledge = {}
        # FIX: traceback.print_exc() already prints the traceback; the old
        # `print traceback.print_exc()` additionally printed a spurious None.
        traceback.print_exc()

    # Set up k-means, seeded with saved cluster centers when available.
    try:
        with open(self.__domainResourcesFile + '_centroids.pickle', 'rb') as centroidFile:
            centroids = pickle.load(centroidFile)
        self.__kMeansValues = KMeans(n_clusters=2, init=centroids)
    except Exception:
        # No saved centroids (or unusable ones): start from scratch.
        self.__kMeansValues = KMeans(n_clusters=2)

    # Utilities; openULR returns (resolved_url, html_content).
    self.__utilitiesFunctions = utilities()
    url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(self.__htmlFileURL)
def __init__(self, pwd, feedURL=''):
    """Initialise the RSS feed reader.

    :param pwd: project root; xpath models are stored in pwd + '/xpathModels/'
    :param feedURL: one feed URL (str), a list of URLs, or '' (default)
        for the built-in technology feed list
    """
    # Feeds considered but not enabled:
    #   https://news.ycombinator.com/rss
    #   http://skimfeed.com/tech.html
    #   https://gcn.com/rss-feeds/all.aspx
    if feedURL == '':
        # FIX: removed the second copy of the news.com.au feed that was in
        # the original default list (it was fetched twice per run).
        self.__feedURL = [
            'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
            'http://www.cnet.com/rss/all/',
            'http://www.wired.com/category/gear/feed/',
            'http://www.wired.com/category/science/feed/',
            'http://www.infoworld.com/index.rss',
            'http://www.pcworld.com/index.rss',
            'http://www.computerworld.com/index.rss',
            'http://www.networkcomputing.com/rss_simple.asp',
            'http://www.engadget.com/rss-full.xml',
            'http://www.digitaltrends.com/feed/',
            'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
            'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
            'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
            'http://feeds.reuters.com/reuters/technologyNews?format=xml',
            'http://feeds.reuters.com/reuters/scienceNews',
            'http://feeds.bbci.co.uk/news/technology/rss.xml',
            'http://feeds.feedburner.com/Technibble',
            'http://feeds.feedburner.com/TechCrunch/',
            'http://feeds.feedburner.com/techradar/allnews',
            'http://feeds.arstechnica.com/arstechnica/index?format=xml',
            'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
        ]
    else:
        self.__feedURL = feedURL

    # Per-feed conditional-GET bookkeeping (etag / last-modified / payload).
    blankState = lambda: {'etag': None, 'modified': None, 'feed': None, 'changed': False}
    if isinstance(self.__feedURL, list):
        self.__etags = {}
        for feed in self.__feedURL:
            self.__etags[feed] = blankState()
    else:
        self.__etags = {self.__feedURL: blankState()}

    self.__articleLinks = []
    self.__domainDBkey = None
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__pwd = pwd
    self.__filesFolder = pwd + '/xpathModels/'
    self.__utilitiesFunctions = utilities()

    # spaCy English pipeline for NER; SPACY_DATA env var overrides the
    # default data directory.
    self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
def __init__(self, domain, htmlFileURL, pwd, dbConnection='', path=''):
    """Set up the per-domain xpath trainer state.

    :param domain: domain name keying the resources in pwd + '/xpathModels/'
    :param htmlFileURL: URL of the HTML page to train on
    :param pwd: project root directory
    :param dbConnection: unused in this constructor
    :param path: unused in this constructor
    """
    # Logging configuration.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Initial variables.
    self.__domain = str(domain)
    self.__htmlFileURL = htmlFileURL
    self.__xpathPaths = []

    # Where per-domain resource files are / will be stored.
    self.__dictionaryPath = pwd + '/xpathModels/'
    self.__domainResourcesFile = self.__dictionaryPath + self.__domain

    # Structure and content elements for xpath creation.
    self.__htmlElements = [
        'body', 'header', 'nav', 'footer', 'article', 'section',
        'aside', 'div', 'span'
    ]
    self.__htmlAttributes = ['id', 'class']
    self.__htmlElementsSkip = ['script', 'style']

    # Load background knowledge, defaulting to empty on any failure.
    try:
        # FIX: `with` closes the handle; `except Exception` replaces the
        # bare `except:` that also trapped KeyboardInterrupt/SystemExit.
        with open(self.__domainResourcesFile + '_bckKnowledge.pickle', 'rb') as bckFile:
            self.__htmlFileBackgroundKnowledge = pickle.load(bckFile)
    except Exception:
        self.__htmlFileBackgroundKnowledge = {}
        # FIX: the old `print traceback.print_exc()` printed the traceback
        # and then a spurious "None" (print_exc returns None).
        traceback.print_exc()

    # K-means: seed with previously saved cluster centers when present.
    try:
        with open(self.__domainResourcesFile + '_centroids.pickle', 'rb') as centroidFile:
            centroids = pickle.load(centroidFile)
        self.__kMeansValues = KMeans(n_clusters=2, init=centroids)
    except Exception:
        # No usable saved centroids: let k-means initialise itself.
        self.__kMeansValues = KMeans(n_clusters=2)

    # Utilities; openULR returns (resolved_url, html_content).
    self.__utilitiesFunctions = utilities()
    url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(
        self.__htmlFileURL)