예제 #1
0
def getText(nodelist):
    """Concatenate the UTF-8 encoded ``data`` of every node in ``nodelist``.

    Any exception raised while reading or encoding a node is logged through
    ``getLogger().error`` and that node is skipped.

    NOTE(review): no ``return`` is visible in this excerpt, so as shown the
    text accumulated in ``rc`` is never handed back -- confirm that the full
    function ends with ``return rc``.
    """
    rc = ""
    for node in nodelist:
        try:
            # Append the encoded node text to the running result.
            rc += node.data.encode("utf8")
        except Exception, msg:
            getLogger().error(msg)
예제 #2
0
def handleStringfilter(org_string, filter):
	"""Trim or clean ``org_string`` according to a ``****`` wildcard filter.

	Filter forms handled below:
	  * ``****SEP`` -- keep the text before the last occurrence of SEP.
	  * ``SEP****`` -- keep the text after the first occurrence of SEP
	    (the result is stripped).
	  * ``A****B``  -- keep the text between the first A and the last B.
	  * otherwise   -- treat the filter as ``;``-separated substrings and
	    delete every occurrence of each one from ``org_string``.

	When the separator(s) are not found the string is left unchanged.
	Any exception is logged via ``getLogger().error``.

	NOTE(review): no ``return`` is visible in this excerpt; presumably
	``ret_text`` is returned after the try block -- confirm.
	NOTE(review): the parameter name ``filter`` shadows the builtin.
	"""

	ret_text = org_string

	try:
		if filter.startswith("****"):
			# Leading wildcard: cut everything from the last separator on.
			sep = filter[4:].strip()
			if org_string.count(sep) > 0:
				ret_text = org_string[:org_string.rfind(sep)]
		elif filter.endswith("****"):
			# Trailing wildcard: keep what follows the first separator.
			sep = filter[:-4].strip()
			if org_string.count(sep) > 0:
				ret_text = org_string[org_string.find(sep)+len(sep):].strip()
		elif filter.count("****") > 0:
			# Wildcard in the middle: only the first two pieces are used.
			pieces = filter.split("****")
			s_cur = org_string.find(pieces[0].strip())
			e_cur = org_string.rfind(pieces[1].strip())
			if s_cur >=0 and e_cur > s_cur:
				ret_text = org_string[s_cur+len(pieces[0].strip()):e_cur]
		else:
			# No wildcard: remove each listed substring.
			del_pieces = filter.split(";")
			for piece in del_pieces:
				org_string = org_string.replace(piece, "")
			ret_text = org_string
	except Exception, msg:
		getLogger().error(msg)
예제 #3
0
def getRssInfo(rss):
	"""Download the feed located at ``rss``.

	Returns None when the download raises; the failure is logged together
	with the feed URL.

	NOTE(review): this excerpt ends right after the download step, so what
	is done with ``t_url``, ``header`` and ``html`` is not visible here.
	"""
	downLoader = Downloader()
	try:
		(t_url, header, html) = downLoader.open(rss)
		print "download ", rss
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
예제 #4
0
    def __init__(self):
        """Create the manager's collaborators and loggers.

        Instantiates the command, event and plugin managers, obtains the
        "Manager" and "yapsy" loggers, and starts with no metrics object.
        """
        self.commands = CommandManager()
        self.event_manager = EventManager()
        self.logger = getLogger("Manager")
        # The plugin manager is handed this instance itself.
        self.plugman = PluginManager(self)
        self.yapsy_logger = getLogger("yapsy")

        self.metrics = None
예제 #5
0
    def __init__(self):
        """Create the manager's collaborators and loggers.

        Instantiates the command, event and plugin managers, obtains the
        "Manager" and "yapsy" loggers, and starts with no metrics object.
        """
        self.commands = CommandManager()
        self.event_manager = EventManager()
        self.logger = getLogger("Manager")
        # The plugin manager is handed this instance itself.
        self.plugman = PluginManager(self)
        self.yapsy_logger = getLogger("yapsy")

        self.metrics = None
예제 #6
0
    def parse(self, contents, temp):
        """Parse feed XML in ``contents`` and collect channel-level metadata.

        The fields "title", "link", "image", "generator", "language",
        "description" and "writer" all start as empty strings and are filled
        from the first matching element in the document.

        When the generator text contains "wordpress" the work is delegated to
        ``self.parseWordPress(contents)``; when its lower-cased form contains
        "blogger" it is delegated to ``self.parseBlogspot(contents)``.

        Any exception is logged via ``getLogger().error``.

        NOTE(review): ``temp`` is not used in the visible code.
        NOTE(review): no ``return result_dict`` is visible in this excerpt.
        """
        # Result accumulator, pre-filled so every field is always present.
        result_dict = dict()

        for field in ["title", "link", "image", "generator", "language", "description", "writer"]:
            result_dict[field] = ""

        try:
            self.dom = xml.dom.minidom.parseString(contents)
            # The first <title> is required: a missing one raises IndexError,
            # which is caught by the outer handler.
            self.title = getText(self.dom.getElementsByTagName("title")[0].childNodes)

            result_dict["title"] = self.title

            if len(self.dom.getElementsByTagName("link")) > 0:
                result_dict["link"] = getText(self.dom.getElementsByTagName("link")[0].childNodes).strip()

            if len(self.dom.getElementsByTagName("image")) > 0:
                result_dict["image"] = getText(
                    self.dom.getElementsByTagName("image")[0].getElementsByTagName("url")[0].childNodes
                )

            if len(self.dom.getElementsByTagName("generator")) > 0:
                result_dict["generator"] = getText(self.dom.getElementsByTagName("generator")[0].childNodes)

            # Hand off to a platform-specific parser when the generator is known.
            # NOTE(review): the "wordpress" test is case-sensitive while the
            # "blogger" test lower-cases first -- confirm this is intended.
            if result_dict["generator"].find("wordpress") >= 0:
                return self.parseWordPress(contents)
            if result_dict["generator"].lower().find("blogger") >= 0:
                return self.parseBlogspot(contents)

            if len(self.dom.getElementsByTagName("language")) > 0:
                result_dict["language"] = getText(self.dom.getElementsByTagName("language")[0].childNodes)
            if len(self.dom.getElementsByTagName("description")) > 0:
                result_dict["description"] = getText(self.dom.getElementsByTagName("description")[0].childNodes)
            try:

                # Writer lookup order: managingEditor, webMaster, then author/name.
                if len(self.dom.getElementsByTagName("managingEditor")) > 0:
                    result_dict["writer"] = getText(self.dom.getElementsByTagName("managingEditor")[0].childNodes)
                elif len(self.dom.getElementsByTagName("webMaster")) > 0:
                    result_dict["writer"] = getText(self.dom.getElementsByTagName("webMaster")[0].childNodes)
                else:
                    tt_list = self.dom.getElementsByTagName("author")

                    try:
                        if len(tt_list) > 0:
                            writer = getText(tt_list[0].getElementsByTagName("name")[0].childNodes)
                            if writer != "":
                                result_dict["writer"] = writer
                                # An author's gd:image "src", when present and
                                # non-empty, replaces the channel image.
                                tt_node = self.dom.getElementsByTagName("author")[0].getElementsByTagName("gd:image")[0]
                                image = tt_node.attributes["src"].value.encode("utf8")
                                if image != "":
                                    result_dict["image"] = image
                    except Exception, msg:
                        # Missing author sub-elements are ignored silently.
                        pass
            except Exception, msg:
                getLogger().error(msg)

        except Exception, msg:
            getLogger().error(msg)
예제 #7
0
def getTistoryId(url):
	downLoader = Downloader()
	attr_dict = dict()
	attr_dict["tid"] = "livere_blogurl = '****.tistory.com';"
	attr_dict["tid2"] = """__addParam("author","****");"""
	try:
		(t_url, header, html) = downLoader.open(url) 
		print "download", url
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
예제 #8
0
def getDBConnection(host, usr, pwd, db, cursor_type="normal"):
	"""Open a MySQLdb connection to ``db`` on ``host`` and create a cursor.

	With ``cursor_type == "dict"`` the connection uses
	``MySQLdb.cursors.DictCursor``; any other value uses the default cursor
	class. The connection character set is set to utf8.

	On failure the error is logged and ``db_connect`` / ``db_cursor`` keep
	whatever value they had when the exception occurred (initially None).

	NOTE(review): no ``return`` is visible in this excerpt -- presumably the
	connection and cursor are returned afterwards; confirm.
	NOTE(review): the log message says "getDBCursor()" although this function
	is named getDBConnection.
	"""
	db_connect = None
	db_cursor = None
	try:
		if cursor_type == "dict":
			db_connect = MySQLdb.connect(host, usr, pwd, db, cursorclass=MySQLdb.cursors.DictCursor)
		else:
			db_connect = MySQLdb.connect(host, usr, pwd, db)
		db_connect.set_character_set('utf8')
		db_cursor = db_connect.cursor()
	except Exception, msg:
		getLogger().error("getDBCursor() Failed : %s"%msg)
예제 #9
0
파일: db_util.py 프로젝트: HDNua/crawlpy
def getDBConnection(host, usr, pwd, db, cursor_type="normal"):
    """Open a MySQLdb connection to ``db`` on ``host`` and create a cursor.

    With ``cursor_type == "dict"`` the connection uses
    ``MySQLdb.cursors.DictCursor``; any other value uses the default cursor
    class. The connection character set is set to utf8.

    On failure the error is logged and ``db_connect`` / ``db_cursor`` keep
    whatever value they had when the exception occurred (initially None).

    NOTE(review): no ``return`` is visible in this excerpt -- presumably the
    connection and cursor are returned afterwards; confirm.
    NOTE(review): the log message says "getDBCursor()" although this function
    is named getDBConnection.
    """
    db_connect = None
    db_cursor = None
    try:
        if cursor_type == "dict":
            db_connect = MySQLdb.connect(
                host, usr, pwd, db, cursorclass=MySQLdb.cursors.DictCursor)
        else:
            db_connect = MySQLdb.connect(host, usr, pwd, db)
        db_connect.set_character_set('utf8')
        db_cursor = db_connect.cursor()
    except Exception, msg:
        getLogger().error("getDBCursor() Failed : %s" % msg)
예제 #10
0
def makeOutputDict(document_data):
    """Build the output dictionary for one crawled document.

    Starts from ``document_data.parsing_result`` (or an empty dict when that
    is falsy) and adds the mode, type, guid, crawl time and web/mobile links.
    "NEWS" documents additionally get channel fields and thumbnail entries;
    "BBS" documents get zero-filled counters, site fields and an "outLinks"
    string built from ``result_dict["bodyLinks"]``.

    NOTE(review): the ``try`` opened below has no handler in this excerpt and
    no ``return`` is visible -- the function continues past what is shown.
    """

    try:

        # Note: when parsing_result is used it is modified in place.
        if document_data.parsing_result:
            result_dict = document_data.parsing_result
        else:
            result_dict = dict()

        result_dict["mode"] = document_data.mode
        result_dict["type"] = document_data.type
        result_dict["guid"] = document_data.guid
        result_dict["crawlTime"] = document_data.crawl_time

        # NOTE(review): as written these replace() calls swap "]" for "]"
        # and therefore change nothing; the original search/replacement text
        # may have been lost -- confirm against the real source.
        if "body" in result_dict:
            result_dict["body"] = result_dict["body"].replace("]", "]")
        if "bodyHtml" in result_dict:
            result_dict["bodyHtml"] = result_dict["bodyHtml"].replace("]", "]")
        if "title" in result_dict:
            # split()/join collapses every run of whitespace to one space.
            result_dict["title"] = " ".join(result_dict["title"].replace("]", "]").split())

        result_dict["webLink"] = document_data.down_url
        result_dict["mobileLink"] = document_data.mobile_url

        if document_data.type == "NEWS":
            result_dict["channelName"] = document_data.domain_data.name
            result_dict["channelIdentifier"] = document_data.domain_data.url
            result_dict["sourceType"] = 4
            try:
                # Copy only the thumbnail variants that are present.
                image_data = document_data.image_data
                if "78x78" in image_data:
                    result_dict["imageThumbnail78x78"] = image_data["78x78"]
                if "126x126" in image_data:
                    result_dict["imageThumbnail126x126"] = image_data["126x126"]
                if "signature" in image_data:
                    result_dict["imageThumbnailSignature"] = image_data["signature"]
            except Exception, msg:
                getLogger().error(msg)
        elif document_data.type == "BBS":
            # Counters missing from the parse result default to 0.
            for int_field in ["readCount", "replyCount", "recommendCount", "videoCount", "imageCount"]:
                if int_field not in result_dict:
                    result_dict[int_field] = 0
            result_dict["siteName"] = document_data.domain_data.name
            result_dict["siteIdentifier"] = document_data.domain_data.url
            # One "link<TAB>text" line per body link.
            # NOTE(review): "bodyLinks" is read without a presence check.
            outLinks = list()
            for link in result_dict["bodyLinks"]:
                l_data = result_dict["bodyLinks"][link]
                outLinks.append("%s\t%s" % (link, l_data.text))
            result_dict["outLinks"] = "\n".join(outLinks)
예제 #11
0
    def sendData(self, data, sc="NEWS"):
        try:

            if sc != "NEWS":
                return "NO SERVICE"

            res = self.producer.send_messages(sc, data)
            if str(res).find("error=0") >= 0:
                return "OK"
            else:
                getLogger().error(str(res))
                return "ERROR"
        except Exception, msg:
            getLogger().error(msg)
            return "ERROR"
예제 #12
0
	def __init__(self, _outputDirPath=None, _documentCountLimit=1000):
		"""Set up the writer's output directory and field lists.

		_outputDirPath: directory to write into; when omitted (None) the
			working directory at call time is used.
		_documentCountLimit: stored as ``self.documentCountLimit``.
		"""
		# Resolve the default here: a default of ``os.getcwd()`` in the
		# signature is evaluated only once, when the function is defined,
		# and would ignore any later change of working directory.
		if _outputDirPath is None:
			_outputDirPath = os.getcwd()
		self.fieldListDic = dict()
		self.outputDirPath = _outputDirPath
		self.documentCountLimit = _documentCountLimit
		self.logger = getLogger()
		self.setupOutputDir()
		self.initSCFieldListDic()
예제 #13
0
    def __init__(self):
        """Record the current version, then run the release checks.

        ``current`` is a name defined outside this method; it is stored both
        unchanged and parsed with ``StrictVersion``. Afterwards
        ``load_release()`` and ``do_warnings()`` are called in that order.
        """
        self.log = getLogger("Updates")
        self.current = current
        self.current_v = StrictVersion(current)

        self.load_release()
        self.do_warnings()
예제 #14
0
 def __init__(self, _outputDirPath=None, _documentCountLimit=1000):
     """Set up the writer's output directory and field lists.

     _outputDirPath: directory to write into; when omitted (None) the
         working directory at call time is used.
     _documentCountLimit: stored as ``self.documentCountLimit``.
     """
     # Resolve the default here: a default of ``os.getcwd()`` in the
     # signature is evaluated only once, when the function is defined,
     # and would ignore any later change of working directory.
     if _outputDirPath is None:
         _outputDirPath = os.getcwd()
     self.fieldListDic = dict()
     self.outputDirPath = _outputDirPath
     self.documentCountLimit = _documentCountLimit
     self.logger = getLogger()
     self.setupOutputDir()
     self.initSCFieldListDic()
예제 #15
0
	def _getRules(self, url, verbose=False):
		"""
		Return the robots.txt rules object for the site that serves ``url``.

		Lookup order:
		  1. the in-memory ``self.robots`` cache, keyed by the robots.txt URL
		  2. download and parse robots.txt via ``self._parsingRobotsFile``,
		     storing the result in the cache

		Returns None when the download or parsing raises. With ``verbose``
		set, cache hits, downloads and failures are logged at info level.
		"""
		logger = log.getLogger()

		# 1. Serve from the local cache when possible (site-level path).
		robots_site_path = urlparse.urljoin(url, "/robots.txt")
		if robots_site_path in self.robots:
			if verbose:
				logger.info("robotstxt in local memory: %s", robots_site_path)
			return self.robots[robots_site_path]

		# 2. Cache miss: download and parse robots.txt.
		rules = None
		try:
			rules = self._parsingRobotsFile(robots_site_path)
			if verbose:
				logger.info("robotstxt downloaded: %s: %s", rules.return_code, robots_site_path)
			self.robots[robots_site_path] = rules
		except Exception as err:
			# Still best effort, but no longer a bare ``except``: that also
			# swallowed KeyboardInterrupt/SystemExit and hid every failure.
			if verbose:
				logger.info("robotstxt failed: %s: %s", err, robots_site_path)

		return rules
예제 #16
0
def isOldImage(chk_api, hash_key):
	"""Query ``chk_api`` with curl and look for ``hash_key`` in the response.

	Returns True as soon as a response line contains the upper-cased
	``hash_key``. A line containing "404" stops the scan and clears
	``exist``. Exceptions are logged via ``getLogger().error``.

	NOTE(review): the excerpt ends here; how ``exist`` is used and what is
	returned in the other cases is not visible.
	NOTE(review): ``chk_api`` is interpolated into a shell command line
	(``shell=True``); it must come from a trusted source.
	"""

	try:
		cmd = "curl %s --connect-timeout 5 --max-time 10 "%chk_api
		exist = True
		fd = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		for line in fd.stdout.readlines():
			results = line.strip()
			# "404" anywhere in a line is taken to mean the image is absent.
			if results.find("404") >= 0:
				exist = False
				break
			if results.find(hash_key.upper()) >= 0:
				getLogger().info("same hash")
				return True
	except Exception, msg:
		getLogger().error(msg)
예제 #17
0
	def parse(self, header, html, url, parser_id=None):
		"""Parse a downloaded page and apply the rule registered for ``parser_id``.

		Rules are loaded with ``self.setRules()`` on first use (while
		``self.prm`` is None). The page is parsed with
		``self.parser.plugParser``; when ``parser_id`` is given and known to
		``self.prm.id_dict`` the matching rule is applied through
		``self.getDataByRule`` and the result, tagged with "parser_id", is
		returned. Errors while applying the rule are logged.

		NOTE(review): the excerpt ends here; the handling for an unknown or
		missing ``parser_id`` is not visible.
		"""

		if self.prm == None:
			self.setRules()

		ret_dict = self.parser.plugParser(header,  html, url)

		result_dict = dict()
		if parser_id != None and parser_id in self.prm.id_dict:
			try:
				host_rule = self.prm.id_dict[parser_id]
				result_dict = self.getDataByRule(host_rule, ret_dict, url)
				result_dict["parser_id"] = parser_id
				return result_dict
			except Exception, msg:
				getLogger().error(msg)
예제 #18
0
def uploadImage(upload_url, file):
	"""Upload ``file`` to ``upload_url + "/fileext/jpg"`` with curl.

	Up to three attempts are made. Returns "OK" as soon as a response line
	starts with "OK", after logging the time elapsed since the function was
	entered. Exceptions are logged and count as a failed attempt.

	NOTE(review): the excerpt ends inside/after the retry loop; the value
	returned when all attempts fail is not visible.
	NOTE(review): ``file`` and ``upload_url`` are interpolated into a shell
	command line (``shell=True``); they must come from a trusted source.
	NOTE(review): the parameter name ``file`` shadows the Python 2 builtin.
	"""
	retry = 0
	results = ""
	m_t = time.time()
	while retry < 3:
		try:
			# 'Expect:' with an empty value disables curl's Expect header.
			cmd = "curl --upload-file %s %s --connect-timeout 5 --max-time 10 --header 'Expect:' "%(file, upload_url+"/fileext/jpg")
			fd = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			for line in fd.stdout.readlines():
				results = line.strip()
				if results.startswith("OK"):
					e_t = time.time()
					getLogger().info("%s upload time :  %s", upload_url, e_t - m_t)
					return "OK"
		except Exception, msg:
			getLogger().error(msg)
		retry += 1
예제 #19
0
def downloadImage(img_url):
	rq = mechanize.Request(img_url)
	try:
		rs = mechanize.urlopen(rq)
		http_content = rs.read()
		header = rs.info()
		return http_content
	except Exception, msg:
		try:
			getLogger().error("%s %s", img_url, msg)
			time.sleep(1)
			rs = mechanize.urlopen(rq)
			http_content = rs.read()
			return http_content
		except Exception, msg:
			getLogger().error("%s %s", img_url, msg)
			return str(msg)
예제 #20
0
파일: manager.py 프로젝트: NotAFile/Ultros
 def __init__(self):
     """Create the "Help" logger and register the alias and command list topics."""
     self.logger = getLogger("Help")
     # Topic registered under the name "aliases".
     self.add_topic(
         AliasListTopic("aliases", HelpTopicType.GenericTopic)
     )
     # Topic registered under the name "commands".
     self.add_topic(
         ComandListTopic("commands", HelpTopicType.GenericTopic)
     )
예제 #21
0
파일: config.py 프로젝트: NotAFile/Ultros
    def __init__(self, filename):
        """Remember the config file name and load it once.

        ``self.exists`` receives the result of ``self.reload(False)``.
        """
        self.callbacks = []

        self.logger = getLogger("YamlConfig")
        # Some sanitizing here to make sure people can't escape the config dirs
        # NOTE(review): str.strip("..") only removes leading/trailing "."
        # characters; a ".." component inside the path (e.g. "a/../../b")
        # is left untouched, so this does not prevent directory escapes.
        filename = filename.strip("..")
        self.filename = filename
        self.exists = self.reload(False)
예제 #22
0
    def __init__(self, filename):
        """Remember the config file name and load it once.

        ``self.exists`` receives the result of ``self.reload(False)``.
        """
        self.callbacks = []

        self.logger = getLogger("YamlConfig")
        # Some sanitizing here to make sure people can't escape the config dirs
        # NOTE(review): str.strip("..") only removes leading/trailing "."
        # characters; a ".." component inside the path (e.g. "a/../../b")
        # is left untouched, so this does not prevent directory escapes.
        filename = filename.strip("..")
        self.filename = filename
        self.exists = self.reload(False)
예제 #23
0
파일: manager.py 프로젝트: NotAFile/Ultros
    def __init__(self, factory_manager=None,
                 path="./plugins", module="plugins"):
        """Store the factory manager and the plugin location.

        factory_manager: required despite the None default.
        path: stored as ``self.path`` (default "./plugins").
        module: stored as ``self.module`` (default "plugins").

        Raises ValueError when ``factory_manager`` is None.
        """
        if factory_manager is None:
            raise ValueError("Factory manager cannot be None!")

        self.log = getLogger("Plugins")

        self.factory_manager = factory_manager

        self.module = module
        self.path = path
예제 #24
0
def makeUrlFromPattern(patterns, ret_key_dict):
	"""Build a URL from the first pattern whose placeholders can all be filled.

	Each pattern contains "(KEY)" placeholders that are replaced with the
	matching values from ``ret_key_dict``. A pattern is accepted once no "("
	or ")" remains in it; the accepted URL is quoted (keeping ``RESERVED``
	characters), normalised and returned.

	Exceptions are logged via ``getLogger().error``.

	NOTE(review): the excerpt ends here; the value returned when no pattern
	matches is not visible.
	"""
	try:
		for ret_url in patterns:
			anypath_str = ""
			for kk in ret_key_dict:
				# Stop substituting into this pattern when an integer-typed
				# key holds a non-integer value.
				if (kk in INT_KEY or kk.startswith("INT_")) and  not isInt(ret_key_dict[kk]) :
					break
				if kk == "ANYPATH" and ret_key_dict[kk].strip() == "":
					# An empty ANYPATH collapses "/(ANYPATH)/" to a single "/".
					ret_url = ret_url.replace("/(ANYPATH)/", "/")
				else:
					ret_url = ret_url.replace("("+kk+")", ret_key_dict[kk])
					if kk == "ANYPATH":
						anypath_str = "/"+ret_key_dict[kk]

			# Only a pattern with no parentheses left is usable.
			if ret_url.count("(") == 0 and ret_url.count(")") == 0:
				ret_url = quote(ret_url.strip(), safe=RESERVED)
				# Fix the "m.www." host prefix, then drop the ANYPATH segment
				# recorded above from the final URL.
				ret_url = ret_url.replace("://m.www.","://m.").replace(anypath_str, "")
				return ret_url
	except Exception, msg:
		getLogger().error(msg)
예제 #25
0
    def writeEachDocumentData(self, f, dataDic, type):

        try:
            if type == "test":
                f.write("%s\t%s\t%s\n" % (dataDic["guid"], dataDic["title"], dataDic["bodyHtml"]))
            else:
                keyList = None
                if "type" in dataDic:
                    scType = dataDic["type"]
                    if scType in self.fieldListDic:
                        keyList = self.fieldListDic[scType]

                if keyList:  # 필드명 지정
                    for field in keyList:
                        if field in dataDic and dataDic[field] and dataDic[field] != "":
                            f.write("		<%s><![CDATA[%s]]></%s>\n" % (field, dataDic[field], field))
                else:  # 필드명 미지정(모두출력)
                    for key, val in dataDic.items():
                        f.write("		<%s><![CDATA[%s]]></%s>\n" % (key, val, key))
        except Exception, msg:
            getLogger().error("%s %s FILE WRITE ERROR", msg, dataDic)
예제 #26
0
    def __init__(self, name, factory, config):
        """Initialise the protocol and start its TCP connection.

        Delegates the basic setup to ``NoChannelsProtocol.__init__``, creates
        the logger and the event/command managers, then connects to the host
        and port given under ``config["connection"]``.
        """
        NoChannelsProtocol.__init__(self, name, factory, config)

        self.log = getLogger(self.name)
        self.event_manager = EventManager()
        self.command_manager = CommandManager()

        # 120 is the fourth positional argument; presumably the connect
        # timeout in seconds (Twisted-style reactor) -- confirm.
        reactor.connectTCP(
            self.config["connection"]["host"],
            self.config["connection"]["port"],
            self.factory,
            120
        )
예제 #27
0
    def __init__(self,
                 factory_manager=None,
                 path="./plugins",
                 module="plugins"):
        """Store the factory manager and the plugin location.

        factory_manager: required despite the None default.
        path: stored as ``self.path`` (default "./plugins").
        module: stored as ``self.module`` (default "plugins").

        Raises ValueError when ``factory_manager`` is None.
        """
        if factory_manager is None:
            raise ValueError("Factory manager cannot be None!")

        self.log = getLogger("Plugins")

        self.factory_manager = factory_manager

        self.module = module
        self.path = path
예제 #28
0
	def __init__(self, init_dic):
		"""Validate ``init_dic`` and prepare the empty pattern tables.

		When ``self.isValidInfo(init_dic)`` is false an error is logged and
		the process exits with status 1.
		"""
		self.info_dic = dict()
		self.logger = getLogger()
		if self.isValidInfo(init_dic):
			self.info_dic = init_dic
		else:
			self.logger.error("Invalid init information")
			# NOTE(review): terminates the whole process from a constructor.
			exit(1)
		self.url_parser = URLParser()  # URL parser in its most basic form

		self.pattern_dic = {"normal":dict(), "host_key":dict()}  # dictionary that will hold all pattern information
		# dic["normal" | "hostkey"][domain] = [(priority, URLData()), ...]

		self.build_pt_dic = dict()  # the extracted reserved words can be used to build various other desired URLs
예제 #29
0
파일: factory.py 프로젝트: gsingh123/Ultros
 def __init__(self, protocol_name, config, manager):
     """Store the protocol configuration and the reconnection settings.

     protocol_name: used as ``self.name`` and in the logger name.
     config: protocol config; ``config["main"]["protocol-type"]`` is read.
     manager: owner object; ``manager.main_config["reconnections"]`` is read.
     """
     self.logger = getLogger("F: %s" % protocol_name)
     self.config = config
     self.manager = manager
     self.name = protocol_name
     self.ptype = config["main"]["protocol-type"]
     # No protocol class or instance is set yet.
     self.protocol_class = None
     self.protocol = None
     # Reconnection settings; delay and attempts are coerced to int.
     manager_config = manager.main_config
     reconnections = manager_config["reconnections"]
     self.r_delay = int(reconnections["delay"])
     self.r_attempts = int(reconnections["attempts"])
     self.r_on_drop = reconnections["on-drop"]
     self.r_on_failure = reconnections["on-failure"]
     self.r_reset = reconnections["reset-on-success"]
예제 #30
0
파일: factory.py 프로젝트: NotAFile/Ultros
 def __init__(self, protocol_name, config, manager):
     """Store the protocol configuration and the reconnection settings.

     protocol_name: used as ``self.name`` and in the logger name.
     config: protocol config; ``config["main"]["protocol-type"]`` is read.
     manager: owner object; ``manager.main_config["reconnections"]`` is read.
     """
     self.logger = getLogger("F: %s" % protocol_name)
     self.config = config
     self.manager = manager
     self.name = protocol_name
     self.ptype = config["main"]["protocol-type"]
     # No protocol class or instance is set yet.
     self.protocol_class = None
     self.protocol = None
     # Reconnection settings; delay and attempts are coerced to int.
     manager_config = manager.main_config
     reconnections = manager_config["reconnections"]
     self.r_delay = int(reconnections["delay"])
     self.r_attempts = int(reconnections["attempts"])
     self.r_on_drop = reconnections["on-drop"]
     self.r_on_failure = reconnections["on-failure"]
     self.r_reset = reconnections["reset-on-success"]
예제 #31
0
파일: data.py 프로젝트: NotAFile/Ultros
    def __init__(self, filename):
        """Remember the data file name, create its directory and load it.

        filename: "/"-separated path of the data file. The directory part
            (everything before the last "/") is created when missing; a
            bare file name has no directory part and nothing is created.
        """
        self.callbacks = []

        self.logger = getLogger("Data")
        # NOTE(review): str.strip("..") only removes leading/trailing "."
        # characters; it does not remove ".." components inside the path.
        filename = filename.strip("..")

        # Directory part of the name.
        folders = filename.rpartition("/")[0]

        # Guard against an empty directory part: os.makedirs("") raises, so
        # a file name without any "/" used to crash here.
        if folders and not os.path.exists(folders):
            os.makedirs(folders)

        self.filename = filename
        self.reload(False)
예제 #32
0
    def __init__(self, filename):
        """Remember the data file name, create its directory and load it.

        filename: "/"-separated path of the data file. The directory part
            (everything before the last "/") is created when missing; a
            bare file name has no directory part and nothing is created.
        """
        self.callbacks = []

        self.logger = getLogger("Data")
        # NOTE(review): str.strip("..") only removes leading/trailing "."
        # characters; it does not remove ".." components inside the path.
        filename = filename.strip("..")

        # Directory part of the name.
        folders = filename.rpartition("/")[0]

        # Guard against an empty directory part: os.makedirs("") raises, so
        # a file name without any "/" used to crash here.
        if folders and not os.path.exists(folders):
            os.makedirs(folders)

        self.filename = filename
        self.reload(False)
예제 #33
0
파일: data.py 프로젝트: NotAFile/Ultros
    def __init__(self, path, *args, **kwargs):
        """Store the connection parameters and connect.

        path: stored as ``self.path``.
        *args / **kwargs: stored unchanged; ``kwargs["url"]`` (default None)
            is also exposed as ``self.url``.

        Ends by calling ``self.reconnect()``.
        """
        self.callbacks = []

        self.logger = getLogger("Redis")

        self.path = path
        self.url = kwargs.get("url", None)

        # Empty args/kwargs are shown as "[]" / "{}" in the trace output.
        # NOTE(review): kwargs are logged verbatim; confirm they never carry
        # credentials.
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        self.args = args
        self.kwargs = kwargs

        self.reconnect()
예제 #34
0
	def isDisallowSite(self, url, verbose=False):
		"""
		Report whether the site's robots.txt blocks the crawler entirely.

		Per the original note, the blocking case is a robots.txt containing:
			User-agent: * or zumbot
			Disallow: /

		Returns a ``(blocked, delay)`` tuple. ``self.delay`` is reset to 3
		and ``self.blocked`` to False before ``self._parsingRobotsFile`` is
		called; presumably that call updates them -- confirm.
		``verbose`` is not used.
		"""
		logger = log.getLogger()
		self.delay = 3
		# Site-level robots.txt location.
		site_robots_url = urlparse.urljoin(url, "/robots.txt")
		self.blocked = False
		# Download and parse; only its effect on self.blocked/self.delay is used.
		self._parsingRobotsFile(site_robots_url)
		return bool(self.blocked), self.delay
예제 #35
0
    def __init__(self, path, *args, **kwargs):
        """Store the connection parameters and connect.

        path: stored as ``self.path``.
        *args / **kwargs: stored unchanged; ``kwargs["url"]`` (default None)
            is also exposed as ``self.url``.

        Ends by calling ``self.reconnect()``.
        """
        self.callbacks = []

        self.logger = getLogger("Redis")

        self.path = path
        self.url = kwargs.get("url", None)

        # Empty args/kwargs are shown as "[]" / "{}" in the trace output.
        # NOTE(review): kwargs are logged verbatim; confirm they never carry
        # credentials.
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        self.args = args
        self.kwargs = kwargs

        self.reconnect()
예제 #36
0
파일: console.py 프로젝트: gsingh123/Ultros
    def __init__(self):
        """Replace sys.stdout/sys.stderr with console wrappers when interactive.

        When stdout is not a TTY or "--no-console" is on the command line,
        only ``self.wrapped = False`` is set and nothing else is initialised
        (no logger, wrappers or reader attributes exist in that case).
        """
        if not sys.stdout.isatty() or "--no-console" in sys.argv:
            self.wrapped = False
            return

        self.logger = getLogger("Console")

        # Keep the original streams so they remain reachable.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        self.wrapper = Wrapper(self)
        self.wrapper_err = WrapperErr(self)
        self.reader = Reader(self)

        # We set it here
        sys.stdout = self.wrapper
        sys.stderr = self.wrapper_err
예제 #37
0
파일: console.py 프로젝트: gsingh123/Ultros
    def __init__(self):
        """Replace sys.stdout/sys.stderr with console wrappers when interactive.

        When stdout is not a TTY or "--no-console" is on the command line,
        only ``self.wrapped = False`` is set and nothing else is initialised
        (no logger, wrappers or reader attributes exist in that case).
        """
        if not sys.stdout.isatty() or "--no-console" in sys.argv:
            self.wrapped = False
            return

        self.logger = getLogger("Console")

        # Keep the original streams so they remain reachable.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        self.wrapper = Wrapper(self)
        self.wrapper_err = WrapperErr(self)
        self.reader = Reader(self)

        # We set it here
        sys.stdout = self.wrapper
        sys.stderr = self.wrapper_err
예제 #38
0
    def __init__(self, name, factory, config):
        """Read the protocol configuration and open the SSL connection.

        Identity, network and control-character settings are read from
        ``config``; the optional "audio" section supplies the self-mute and
        self-deafen flags (both default to True).

        A "PreConnect" event is fired before connecting and a "PostConnect"
        event after ``reactor.connectSSL`` has been called. When no client
        context can be created the protocol is removed from the factory's
        manager and construction stops early.
        """
        self.name = name
        self.factory = factory
        self.config = config

        self.received = ""
        self.log = getLogger(self.name)
        self.log.info("Setting up..")

        self.command_manager = CommandManager()
        self.event_manager = EventManager()

        self.username = config["identity"]["username"]
        self.password = config["identity"]["password"]
        self.networking = config["network"]
        self.tokens = config["identity"]["tokens"]

        self.control_chars = config["control_chars"]

        # The audio section is optional.
        audio_conf = config.get("audio", {})
        self.should_mute_self = audio_conf.get("should_mute_self", True)
        self.should_deafen_self = audio_conf.get("should_deafen_self", True)

        event = general_events.PreConnectEvent(self, config)
        self.event_manager.run_callback("PreConnect", event)

        context = self._get_client_context()
        if context is None:
            # Could not create a context (problem loading cert file)
            self.factory.manager.remove_protocol(self.name)
            return

        # 120 is the fifth positional argument; presumably the connect
        # timeout in seconds (Twisted-style reactor) -- confirm.
        reactor.connectSSL(
            self.networking["address"],
            self.networking["port"],
            self.factory,
            context,
            120
        )

        event = general_events.PostConnectEvent(self, config)
        self.event_manager.run_callback("PostConnect", event)
예제 #39
0
    def __init__(self, factory, config):
        """Read the server/identity settings and open the TCP connection.

        ``config["server"]`` supplies the address, port and sid;
        ``config["identity"]`` supplies the username and password.
        """
        self.factory = factory
        self.config = config
        self.log = getLogger("TS3")

        self.log.info("Setting up..")

        self.server = config["server"]
        self.identity = config["identity"]

        self.user = self.identity["username"]
        self.passw = self.identity["password"]
        self.sid = self.server["sid"]

        # 120 is the fourth positional argument; presumably the connect
        # timeout in seconds (Twisted-style reactor) -- confirm.
        reactor.connectTCP(
            self.server["address"],
            self.server["port"],
            self.factory,
            120
        )
예제 #40
0
파일: data.py 프로젝트: NotAFile/Ultros
    def __init__(self, path, *args, **kwargs):
        """Derive the DB-API module name from ``path`` and connect.

        ``path`` has every "//" collapsed to "/" and everything up to and
        including its first "/" removed. The text before the first ":" of the
        remainder is stored as ``self.parsed_module``. ``args`` and ``kwargs``
        are stored unchanged, then ``self.reconnect()`` is called.

        NOTE(review): a ``path`` without any "/" raises IndexError below.
        """
        self.callbacks = []

        self.logger = getLogger("DBAPI")

        path = path.replace("//", "/")
        # Drop the leading component (everything before the first "/").
        path = path.split("/", 1)[1]

        self.path = path

        # Empty args/kwargs are shown as "[]" / "{}" in the trace output.
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        parsed_module = path.split(":", 1)[0]
        self.parsed_module = parsed_module
        self.args = args
        self.kwargs = kwargs

        self.logger.debug(_("Parsed module: %s") % parsed_module)

        self.reconnect()
예제 #41
0
    def __init__(self, path, *args, **kwargs):
        """Derive the DB-API module name from ``path`` and connect.

        ``path`` has every "//" collapsed to "/" and everything up to and
        including its first "/" removed. The text before the first ":" of the
        remainder is stored as ``self.parsed_module``. ``args`` and ``kwargs``
        are stored unchanged, then ``self.reconnect()`` is called.

        NOTE(review): a ``path`` without any "/" raises IndexError below.
        """
        self.callbacks = []

        self.logger = getLogger("DBAPI")

        path = path.replace("//", "/")
        # Drop the leading component (everything before the first "/").
        path = path.split("/", 1)[1]

        self.path = path

        # Empty args/kwargs are shown as "[]" / "{}" in the trace output.
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        parsed_module = path.split(":", 1)[0]
        self.parsed_module = parsed_module
        self.args = args
        self.kwargs = kwargs

        self.logger.debug(_("Parsed module: %s") % parsed_module)

        self.reconnect()
예제 #42
0
    def set_language(self, lang=None, mlang=None):
        if lang is None:
            lang = DEFAULT

        if mlang is None:
            mlang = DEFAULT

        self.get_known()

        if self.log and self.logger is None:
            from utils.log import getLogger

            self.logger = getLogger("Translations")

        if lang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (lang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'" %
                                 (lang, DEFAULT))

            lang = DEFAULT

        if mlang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (mlang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'" %
                                 (mlang, DEFAULT))

            mlang = DEFAULT

        self.language = lang
        self.m_language = mlang
        self.reload()
예제 #43
0
    def set_language(self, lang=None, mlang=None):
        if lang is None:
            lang = DEFAULT

        if mlang is None:
            mlang = DEFAULT

        self.get_known()

        if self.log and self.logger is None:
            from utils.log import getLogger

            self.logger = getLogger("Translations")

        if lang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (lang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'"
                                 % (lang, DEFAULT))

            lang = DEFAULT

        if mlang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (mlang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'"
                                 % (mlang, DEFAULT))

            mlang = DEFAULT

        self.language = lang
        self.m_language = mlang
        self.reload()
예제 #44
0
    def __init__(self):
        """Build a permissions handler backed by config/data dirs under ``tmpdir``.

        Creates ``<tmpdir>/config/`` and ``<tmpdir>/data/`` when missing,
        writes a minimal ``settings.yml`` into the config dir, then creates
        the storage manager, the "permissions.yml" data file and the
        permissions handler.
        """
        self.logger = getLogger("Permissions")

        self.confdir = tmpdir + "/config/"
        self.datadir = tmpdir + "/data/"

        # Create each directory on its own. Previously both makedirs calls
        # shared one try block, so an already existing config dir raised and
        # silently prevented the data dir from being created.
        created = False
        for directory in (self.confdir, self.datadir):
            if not os.path.isdir(directory):
                os.makedirs(directory)
                created = True
        if created:
            self.logger.debug("Config and data dirs created.")

        # Close the settings file right after writing so the handle is not
        # leaked and the YAML is flushed before anything else reads it.
        with open(self.confdir + "settings.yml", "w") as settings_file:
            yaml.dump({"editor_warning": False}, settings_file)

        self.storage = StorageManager(self.confdir, self.datadir)

        self.data = self.storage.get_file(self, "data", formats.YAML,
                                          "permissions.yml")

        self.handler = permissionsHandler(self, self.data)

        # NOTE(review): super() is anchored at PluginObject, so
        # PluginObject.__init__ itself is skipped -- confirm this is intended.
        super(PluginObject, self).__init__()
예제 #45
0
import logging
import os
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
import zerorpc
from sklearn.linear_model import SGDClassifier
from sqlalchemy import update

from model.db import DB_ENGINE, rawcontents
from utils.log import getLogger

# Module-level logger obtained under the name 'semiTrain'.
logger = getLogger('semiTrain')


def fetchAllData(threshold):
    """Load rid, tag and assure for rows whose content is longer than ``threshold``.

    Returns the result of ``pd.read_sql`` run against ``DB_ENGINE``.
    """
    # NOTE(review): ``threshold`` is formatted straight into the SQL text, so
    # it must be a trusted numeric value.
    query = 'SELECT rid, tag, assure FROM rawcontents WHERE LENGTH(content) > {}'.format(threshold)
    return pd.read_sql(query, DB_ENGINE)


def randomSelectData(data, count):
    """Return a copy of up to ``count`` rows of ``data`` picked at random.

    Row positions are shuffled with NumPy's global random state and the
    first ``count`` of them are selected, so the rows come back in shuffled
    order and all rows are returned when ``count`` exceeds ``len(data)``.
    """
    order = np.arange(len(data))
    np.random.shuffle(order)
    chosen = order[:count]
    # copy() so the caller can modify the sample without touching ``data``.
    return data.iloc[chosen].copy()


def completeTrainData(current):
    # Query template; the "{}" placeholder is meant to receive the list of
    # rids for the IN clause.
    # NOTE(review): the excerpt ends here -- how ``current`` supplies those
    # rids and what is returned is not visible.
    stmt = 'SELECT rid, content, vector FROM rawcontents WHERE rid IN {}'
예제 #46
0
    def __init__(self, data_dict):
        """Wrap an existing dictionary as the data store.

        ``data_dict`` is kept by reference as ``self.data`` (it is not copied).
        """
        self.callbacks = []

        self.logger = getLogger("Data")
        self.data = data_dict
예제 #47
0
    def __init__(self):
        """Create the logger and verify that ``tty`` and ``sys`` import.

        The imports are local to this method; the imported modules are only
        reported at trace level here.
        """
        self.log = getLogger("GetchUnix")
        import tty
        import sys

        self.log.trace(_("Loaded: %s, %s") % (tty, sys))
예제 #48
0
파일: db.py 프로젝트: LuminousXLB/CrawlerTY
from collections import namedtuple

from sqlalchemy import MetaData, create_engine
from sqlalchemy.schema import (Column, ForeignKey, ForeignKeyConstraint, Index,
                               PrimaryKeyConstraint, Table, UniqueConstraint)
from sqlalchemy.types import BLOB, Boolean, DateTime, Float, Integer, String

from settings import DB_ENGINE_FILE, ECHO_DATABASE_INFO
from utils.log import getLogger

# Module-level logger obtained under the name 'db'.
logger = getLogger('db')

# models

# SQLite engine for the file named by settings.DB_ENGINE_FILE; SQL echoing
# is controlled by settings.ECHO_DATABASE_INFO.
DB_ENGINE = create_engine('sqlite:///{}'.format(DB_ENGINE_FILE),
                          echo=ECHO_DATABASE_INFO)

# Shared metadata object that the tables below are registered on.
metadata = MetaData()

posts = Table(
    'posts', metadata, Column('pid', Integer, autoincrement=True),
    Column('blockid', String, nullable=False, comment='板块id'),
    Column('postid', Integer, nullable=False, comment='帖子id'),
    Column('title', String, nullable=False, comment='帖子标题'),
    Column('pageurl', String, nullable=False, comment='帖子首页url'),
    Column('subType', String, comment='帖子子类型'),
    Column('activityuserid', Integer, nullable=False, comment='楼主id'),
    Column('clickcount', Integer, nullable=False, comment='点击数'),
    Column('replycount', Integer, nullable=False, comment='回复数'),
    Column('remarkcount', Integer, nullable=False, comment='楼主发言数'),
    Column('imgcount', Integer, nullable=False, comment='图片数'),
예제 #49
0
파일: tokens.py 프로젝트: gsingh123/Ultros
    def __init__(self):
        """Compile the token regular expressions and create the logger."""
        # A "{...}" group containing no "}" inside.
        self.token_regex = re.compile(r"\{[^}]*\}")
        # A ":" that is not preceded by a backslash.
        self.parse_regex = re.compile(r"(?<!\\):")
        # A backslash-escaped colon ("\:").
        self.escape_regex = re.compile(r"\\:")

        self.logger = getLogger("Tokens")
예제 #50
0
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from sqlalchemy import select

from model.db import DB_ENGINE, posts, rawcontents
from utils.log import getLogger

# Module-level logger obtained under the name 'doc2vec'.
logger = getLogger('doc2vec')

# init

# Build one TaggedDocument per post title, tagged with its position in the
# result set.
# NOTE(review): ``doc`` is the row returned by the query, not a list of
# words -- confirm this is the token format Doc2Vec is expected to receive.
with DB_ENGINE.connect() as conn:
    s = select([posts.c.title])
    documents = [
        TaggedDocument(doc, [pid]) for pid, doc in enumerate(conn.execute(s))
    ]

# Train on the titles, using 2048-dimensional vectors.
model = Doc2Vec(documents, vector_size=2048, window=5, min_count=1, workers=4)

model.save('RuntimeTY/d2v_2048_5_1216')

# The model that was just saved is immediately loaded back from disk.
model = Doc2Vec.load('RuntimeTY/d2v_2048_5_1216')

logger.critical('Loaded')

# Walk over every rawcontents row.
# NOTE(review): the excerpt ends inside this loop; how ``buffer`` is filled
# and used is not visible.
with DB_ENGINE.connect() as conn:
    s = select([rawcontents])
    buffer = []

    for row in conn.execute(s):
        rid = row[rawcontents.c.rid]
예제 #51
0
 def __init__(self):
     """Create the "Help" logger and register the alias and command list topics."""
     self.logger = getLogger("Help")
     # Topics registered under the names "aliases" and "commands".
     self.add_topic(AliasListTopic("aliases", HelpTopicType.GenericTopic))
     self.add_topic(ComandListTopic("commands", HelpTopicType.GenericTopic))
예제 #52
0
    def __init__(self):
        """Create the logger and verify that ``msvcrt`` imports.

        The import is local to this method; the imported module is only
        reported at trace level here.
        """
        self.log = getLogger("GetchWindows")
        import msvcrt

        self.log.trace(_("Loaded: %s") % msvcrt)
예제 #53
0
import logging
import re
from urllib import parse

import demjson

from model.db import Reply
from utils.datetime import parseDatetimeString
from utils.log import getLogger
from utils.request import getSoup, with_max_retries

# Module-level logger named 'parser'; logging.INFO is passed as the second
# argument (presumably the log level -- confirm against utils.log).
logger = getLogger('parser', logging.INFO)


class FetchPostFailed(BaseException):
    """Signal that a post could not be fetched.

    Derives from BaseException rather than Exception, so handlers written as
    ``except Exception`` do not catch it.
    """


# app


@with_max_retries(3, 10)
def extractAll(blockid, postid):
    url = urlFactory(blockid, postid, 1)
    soup = getPage(url)
    if soup == None:
        return None

    bbsGlobal = extractBBSGlobal(soup)
    if bbsGlobal['isWenda'] or bbsGlobal['subType'] == '本版隐藏':
        logger.info('Got subtype with {} in {}'.format(bbsGlobal['subType'],
예제 #54
0
from utils.log import getLogger
import pandas as pd
from model.db import DB_ENGINE
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pickle
import os
from sklearn import ensemble, svm

# Module-level logger named 'predict'.
logger = getLogger('predict')

# Load the rows of rawcontents whose `assure` value is above 0.5.
labeled = pd.read_sql(
    'SELECT rid, tag, vector FROM rawcontents WHERE assure>0.5', DB_ENGINE)

# One entry per distinct `vector` value, holding the mean `tag` of its rows.
train = labeled.groupby('vector')['tag'].mean()

# The group keys are unpickled to obtain the feature values.
# NOTE(review): pickle.loads runs on values read from the database;
# unpickling can execute arbitrary code if that data is not trusted.
X = list(pd.Series(train.index).apply(pickle.loads))
# NOTE(review): y holds per-group means of `tag`, while SVC is a classifier —
# confirm the means are always valid class labels.
y = list(train.values)

logger.info('Build the model')

clf = svm.SVC(kernel='rbf')

logger.info('Fitting...')

clf.fit(X, y)

# Pickle the fitted classifier to disk.
# NOTE(review): DATA_ROOT is neither imported nor defined in the lines
# visible here; the `/` operator implies a path-like object — confirm where
# it comes from.
with open(DATA_ROOT / 'SVC_rbf_model.bin', 'wb') as f:
    pickle.dump(clf, f)
예제 #55
0
import pymysql
import configparser
import logging
from argon2 import PasswordHasher
from classes.User import User
from classes.PSAlbum import PSAlbum
from utils.log import getConsoleHandler, getFileHandler, getLogger
import time


# Module-level logger for this module; "logs/photoshare.log" is passed as the
# second argument (presumably the log file path — confirm against
# utils.log.getLogger). A debug line is emitted as soon as the module loads.
psLogger = getLogger(__name__, "logs/photoshare.log")
psLogger.debug("Loading DBConnection class")

class dbConnection:
    """Holder for the SQL connection settings read from a settings object.

    Every attribute defaults to an empty string at class level; ``__init__``
    overrides all of them except ``SQL_CONNECTION`` on the instance.
    """

    # Class-level defaults.
    USERNAME = ''
    PASSWORD = ''
    HOST = ''
    DATABASE_NAME = ''
    CHARSET = ''
    SQL_CONNECTION = ''

    def __init__(self, settings):
        """Copy the options of the 'SQL' section onto this instance.

        Args:
            settings: object exposing ``get(section, option)``; only that
                method is used here.
        """
        # (attribute name, option name) pairs, read in this order.
        option_map = (
            ('HOST', 'host'),
            ('USERNAME', 'user'),
            ('PASSWORD', 'password'),
            ('DATABASE_NAME', 'dbName'),
            ('CHARSET', 'charset'),
        )
        for attribute, option in option_map:
            setattr(self, attribute, settings.get('SQL', option))

예제 #56
0
from utils.log import getLogger, logging
from utils.request import fetchJson, with_max_retries
import json

# Module-level logger named 'reward'; logging.INFO is passed as the second
# argument (presumably the level — confirm against utils.log.getLogger).
logger = getLogger('reward', logging.INFO)


class FetchRewardInfoFailed(BaseException):
    """Exception type for a failed reward-info fetch (as the name suggests).

    Derives from BaseException rather than Exception, so handlers written
    as ``except Exception`` do not catch it.
    """


# app


def fetchRewardInfo(bbsGlobal):
    """Assemble the request form for 'bbs.api.getArticleDashangInfo'.

    The item and article id are copied from ``bbsGlobal``. Each of the three
    id fields is forwarded so that it starts with '0': a value whose first
    element is already '0' is used unchanged, any other value (including an
    empty one) gets '0,' prepended.

    Only the form construction is present in this body; nothing is sent and
    nothing is returned.

    Args:
        bbsGlobal: mapping providing 'item', 'artId', 'tyfen_rewardIds',
            'tyfen_tyfIds' and 'shangIds'.

    Raises:
        KeyError: if one of those keys is missing from ``bbsGlobal``.
    """
    # (form field, bbsGlobal key) pairs, processed in this order.
    id_fields = (
        ('params.rewardIds', 'tyfen_rewardIds'),
        ('params.tyfIds', 'tyfen_tyfIds'),
        ('params.shangIds', 'shangIds'),
    )

    form = {}
    form['method'] = 'bbs.api.getArticleDashangInfo'
    form['params.item'] = bbsGlobal['item']
    form['params.articleId'] = bbsGlobal['artId']

    for form_key, global_key in id_fields:
        ids = bbsGlobal[global_key]
        starts_with_zero = len(ids) > 0 and ids[0] == '0'
        form[form_key] = ids if starts_with_zero else '0,' + ids
예제 #57
0
    def __init__(self, data_dict):
        """Set up the instance around an already-built data object.

        Args:
            data_dict: object stored as-is on ``self.data`` (no copy is made).
        """
        self.callbacks = list()
        self.logger = getLogger("MemoryConfig")

        # `exists` is unconditionally True here.
        self.exists, self.data = True, data_dict
예제 #58
0
Author: Hai Liang Wang <*****@*****.**>
'''

import os
import tensorflow as tf
import shutil
from config import Config
from tqdm import tqdm
from utils import log
from munch import munchify
from models.rnn import Model
from dataset.textdata import TextData
from time import localtime, strftime

# Module-wide configuration object and logger, both created at import time.
config = Config()
logger = log.getLogger(__name__)


def main(unused_argv):
    """Build a ``TextData`` instance from the module-level ``config``.

    Args:
        unused_argv: not read by this function.
    """
    # Options handed to TextData; the values come from `config`, apart from
    # the fixed datasetTag / test / watsonMode entries.
    text_data_args = dict(
        rootDir=config.root_dir,
        corpus=config.corpus_name,
        maxLength=config.train_max_length,
        maxLengthEnco=config.train_max_length_enco,
        maxLengthDeco=config.train_max_length_deco,
        datasetTag='',
        test=False,
        watsonMode=False,
        batchSize=config.train_num_batch_size,
    )
    batch_data = TextData(munchify(text_data_args))
예제 #59
0
import json
import logging
import time
from functools import wraps

import requests
from bs4 import BeautifulSoup

from utils.log import getLogger

# Module-level logger named 'request'; logging.INFO is passed as the second
# argument (presumably the level — confirm against utils.log.getLogger).
logger = getLogger('request', logging.INFO)

# A single requests.Session created at import time and kept at module level,
# with a desktop Chrome-on-Windows User-Agent string set on its headers.
session = requests.Session()
session.headers[
    'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

# functions


def lauchRequest(req_func, ret_func, sleep_time):
    """Wrap ``req_func`` so every call sleeps first and post-processes the result.

    Args:
        req_func: callable invoked with the wrapper's positional arguments;
            its result must expose ``status_code``, ``request.method`` and
            ``url``.
        ret_func: callable applied to that result.
        sleep_time: default value passed to ``time.sleep`` before each call;
            it can be overridden per call with the ``sleep_time`` keyword.

    Returns:
        A function that returns the tuple ``(ret_func(response), response)``.
    """
    def func(*args, sleep_time=sleep_time):
        # Pause before issuing the request.
        time.sleep(sleep_time)
        response = req_func(*args)

        summary = 'Return {} from {} {}'.format(
            response.status_code, response.request.method, response.url)
        logger.debug(summary)

        converted = ret_func(response)
        return converted, response

    return func