Пример #1
0
def getText(nodelist):
    """Concatenate the UTF-8 encoded ``data`` of every node in ``nodelist``.

    A node whose ``data`` cannot be read or encoded is skipped after the
    error is logged.

    NOTE(review): no ``return`` is visible in this excerpt, so as shown the
    accumulated text is discarded -- confirm whether ``return rc`` was cut off.
    """
    rc = ""
    for node in nodelist:
        try:
            rc += node.data.encode("utf8")
        except Exception, msg:
            # Best effort: log the failure and carry on with the next node.
            getLogger().error(msg)
Пример #2
0
def handleStringfilter(org_string, filter):
	"""Trim or clean ``org_string`` according to the ``filter`` pattern.

	``"****"`` inside ``filter`` marks the part of the text to keep:

	* ``"****SEP"`` -- keep everything before the last ``SEP``.
	* ``"SEP****"`` -- keep everything after the first ``SEP`` (stripped).
	* ``"A****B"``  -- keep the text between the first ``A`` and the last ``B``.
	* no ``"****"`` -- treat ``filter`` as ``;``-separated substrings and
	  remove every occurrence of each from ``org_string``.

	When the separators are not found the text is left unchanged.  Any
	exception is logged.

	NOTE(review): no ``return`` is visible in this excerpt; presumably
	``ret_text`` is returned further down -- confirm.
	"""

	ret_text = org_string

	try:
		if filter.startswith("****"):
			# Keep the head: cut at the LAST occurrence of the separator.
			sep = filter[4:].strip()
			if org_string.count(sep) > 0:
				ret_text = org_string[:org_string.rfind(sep)]
		elif filter.endswith("****"):
			# Keep the tail: start right after the FIRST occurrence.
			sep = filter[:-4].strip()
			if org_string.count(sep) > 0:
				ret_text = org_string[org_string.find(sep)+len(sep):].strip()
		elif filter.count("****") > 0:
			# Keep the middle; only the first two pieces of the split are used.
			pieces = filter.split("****")
			s_cur = org_string.find(pieces[0].strip())
			e_cur = org_string.rfind(pieces[1].strip())
			if s_cur >=0 and e_cur > s_cur:
				ret_text = org_string[s_cur+len(pieces[0].strip()):e_cur]
		else:
			# Plain deletion list; pieces are used as-is (not stripped).
			del_pieces = filter.split(";")
			for piece in del_pieces:
				org_string = org_string.replace(piece, "")
			ret_text = org_string
	except Exception, msg:
		getLogger().error(msg)
Пример #3
0
def getRssInfo(rss):
	"""Download the feed at ``rss``.

	Returns None when the download raises; the error is logged together
	with the URL.

	NOTE(review): the excerpt ends right after the download, so what is
	done with the response is not visible here.
	"""
	downLoader = Downloader()
	try:
		# open() yields a 3-tuple that is unpacked as (url, header, body).
		(t_url, header, html) = downLoader.open(rss) 
		print "download ", rss
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
Пример #4
0
    def __init__(self):
        """Create the command, event and plugin managers plus two loggers."""
        self.commands = CommandManager()
        self.event_manager = EventManager()
        self.logger = getLogger("Manager")
        # PluginManager is handed this instance; keep it after the assignments
        # above in case it reads them -- TODO confirm.
        self.plugman = PluginManager(self)
        self.yapsy_logger = getLogger("yapsy")

        # Starts out unset; nothing in this method assigns a metrics object.
        self.metrics = None
Пример #5
0
    def __init__(self):
        """Create the command, event and plugin managers plus two loggers."""
        self.commands = CommandManager()
        self.event_manager = EventManager()
        self.logger = getLogger("Manager")
        # PluginManager is handed this instance; keep it after the assignments
        # above in case it reads them -- TODO confirm.
        self.plugman = PluginManager(self)
        self.yapsy_logger = getLogger("yapsy")

        # Starts out unset; nothing in this method assigns a metrics object.
        self.metrics = None
Пример #6
0
    def parse(self, contents, temp):
        """Parse feed XML in ``contents`` and collect channel-level metadata.

        Fills a dict with the keys title, link, image, generator, language,
        description and writer (each defaults to "").  When the generator
        text contains "wordpress" the work is delegated to parseWordPress();
        when it contains "blogger" (case-insensitive) to parseBlogspot().
        Exceptions are logged rather than raised.

        ``temp`` is not used in the visible code.

        NOTE(review): no return of ``result_dict`` is visible in this excerpt
        for the generic path -- presumably it follows below; confirm.
        """
        # Result dictionary, pre-filled so every key is always present.
        result_dict = dict()

        for field in ["title", "link", "image", "generator", "language", "description", "writer"]:
            result_dict[field] = ""

        try:
            self.dom = xml.dom.minidom.parseString(contents)
            # The first <title> element is required; a missing one raises
            # IndexError, which the outer handler logs.
            self.title = getText(self.dom.getElementsByTagName("title")[0].childNodes)

            result_dict["title"] = self.title

            if len(self.dom.getElementsByTagName("link")) > 0:
                result_dict["link"] = getText(self.dom.getElementsByTagName("link")[0].childNodes).strip()

            if len(self.dom.getElementsByTagName("image")) > 0:
                result_dict["image"] = getText(
                    self.dom.getElementsByTagName("image")[0].getElementsByTagName("url")[0].childNodes
                )

            if len(self.dom.getElementsByTagName("generator")) > 0:
                result_dict["generator"] = getText(self.dom.getElementsByTagName("generator")[0].childNodes)

            # NOTE(review): the "wordpress" test is case-sensitive while the
            # "blogger" test lower-cases first -- confirm this is intended.
            if result_dict["generator"].find("wordpress") >= 0:
                return self.parseWordPress(contents)
            if result_dict["generator"].lower().find("blogger") >= 0:
                return self.parseBlogspot(contents)

            if len(self.dom.getElementsByTagName("language")) > 0:
                result_dict["language"] = getText(self.dom.getElementsByTagName("language")[0].childNodes)
            if len(self.dom.getElementsByTagName("description")) > 0:
                result_dict["description"] = getText(self.dom.getElementsByTagName("description")[0].childNodes)
            try:

                # Writer lookup order: managingEditor, webMaster, then author/name.
                if len(self.dom.getElementsByTagName("managingEditor")) > 0:
                    result_dict["writer"] = getText(self.dom.getElementsByTagName("managingEditor")[0].childNodes)
                elif len(self.dom.getElementsByTagName("webMaster")) > 0:
                    result_dict["writer"] = getText(self.dom.getElementsByTagName("webMaster")[0].childNodes)
                else:
                    tt_list = self.dom.getElementsByTagName("author")

                    try:
                        if len(tt_list) > 0:
                            writer = getText(tt_list[0].getElementsByTagName("name")[0].childNodes)
                            if writer != "":
                                result_dict["writer"] = writer
                                # The author's gd:image src, when present and
                                # non-empty, replaces the channel image.
                                tt_node = self.dom.getElementsByTagName("author")[0].getElementsByTagName("gd:image")[0]
                                image = tt_node.attributes["src"].value.encode("utf8")
                                if image != "":
                                    result_dict["image"] = image
                    except Exception, msg:
                        # Author details are optional; failures here are ignored.
                        pass
            except Exception, msg:
                getLogger().error(msg)

        except Exception, msg:
            getLogger().error(msg)
Пример #7
0
def getTistoryId(url):
	downLoader = Downloader()
	attr_dict = dict()
	attr_dict["tid"] = "livere_blogurl = '****.tistory.com';"
	attr_dict["tid2"] = """__addParam("author","****");"""
	try:
		(t_url, header, html) = downLoader.open(url) 
		print "download", url
	except Exception, msg:
		getLogger().error("feed download error : %s %s", msg, rss)
		return None
Пример #8
0
def getDBConnection(host, usr, pwd, db, cursor_type="normal"):
	"""Open a MySQL connection and create a cursor on it.

	cursor_type "dict" connects with MySQLdb.cursors.DictCursor as the
	cursor class; any other value uses the MySQLdb default.  The connection
	character set is set to utf8.  Failures are logged, leaving
	db_connect / db_cursor at whatever was assigned before the error
	(None initially).

	NOTE(review): no ``return`` is visible in this excerpt; presumably the
	connection and cursor are returned further down -- confirm.
	"""
	db_connect = None
	db_cursor = None
	try:
		if cursor_type == "dict":
			db_connect = MySQLdb.connect(host, usr, pwd, db, cursorclass=MySQLdb.cursors.DictCursor)
		else:
			db_connect = MySQLdb.connect(host, usr, pwd, db)
		db_connect.set_character_set('utf8')
		db_cursor = db_connect.cursor()
	except Exception, msg:
		# The message says getDBCursor() although this function is getDBConnection().
		getLogger().error("getDBCursor() Failed : %s"%msg)
Пример #9
0
def getDBConnection(host, usr, pwd, db, cursor_type="normal"):
    """Open a MySQL connection and create a cursor on it.

    cursor_type "dict" connects with MySQLdb.cursors.DictCursor as the
    cursor class; any other value uses the MySQLdb default.  The connection
    character set is set to utf8.  Failures are logged, leaving
    db_connect / db_cursor at whatever was assigned before the error
    (None initially).

    NOTE(review): no ``return`` is visible in this excerpt; presumably the
    connection and cursor are returned further down -- confirm.
    """
    db_connect = None
    db_cursor = None
    try:
        if cursor_type == "dict":
            db_connect = MySQLdb.connect(
                host, usr, pwd, db, cursorclass=MySQLdb.cursors.DictCursor)
        else:
            db_connect = MySQLdb.connect(host, usr, pwd, db)
        db_connect.set_character_set('utf8')
        db_cursor = db_connect.cursor()
    except Exception, msg:
        # The message says getDBCursor() although this function is getDBConnection().
        getLogger().error("getDBCursor() Failed : %s" % msg)
Пример #10
0
def makeOutputDict(document_data):
    """Build the output dictionary for one crawled document.

    Starts from ``document_data.parsing_result`` when it is truthy (that
    object is then modified in place), otherwise from an empty dict, and
    adds crawl metadata, links and type-specific fields for "NEWS" and
    "BBS" documents.

    NOTE(review): the excerpt ends inside the ``try`` block; the matching
    handler and the return statement are not visible here.
    """

    try:

        if document_data.parsing_result:
            result_dict = document_data.parsing_result
        else:
            result_dict = dict()

        result_dict["mode"] = document_data.mode
        result_dict["type"] = document_data.type
        result_dict["guid"] = document_data.guid
        result_dict["crawlTime"] = document_data.crawl_time

        # NOTE(review): as written each replace() swaps "]" for an identical
        # "]" and so changes nothing -- possibly the replacement text was lost;
        # confirm against the upstream source.
        if "body" in result_dict:
            result_dict["body"] = result_dict["body"].replace("]", "]")
        if "bodyHtml" in result_dict:
            result_dict["bodyHtml"] = result_dict["bodyHtml"].replace("]", "]")
        if "title" in result_dict:
            # split()/join collapses every whitespace run into a single space.
            result_dict["title"] = " ".join(result_dict["title"].replace("]", "]").split())

        result_dict["webLink"] = document_data.down_url
        result_dict["mobileLink"] = document_data.mobile_url

        if document_data.type == "NEWS":
            result_dict["channelName"] = document_data.domain_data.name
            result_dict["channelIdentifier"] = document_data.domain_data.url
            result_dict["sourceType"] = 4
            try:
                # Copy whichever thumbnail entries are present; a failure is
                # logged and does not abort the rest of the document.
                image_data = document_data.image_data
                if "78x78" in image_data:
                    result_dict["imageThumbnail78x78"] = image_data["78x78"]
                if "126x126" in image_data:
                    result_dict["imageThumbnail126x126"] = image_data["126x126"]
                if "signature" in image_data:
                    result_dict["imageThumbnailSignature"] = image_data["signature"]
            except Exception, msg:
                getLogger().error(msg)
        elif document_data.type == "BBS":
            # Counters missing from the parse result default to 0.
            for int_field in ["readCount", "replyCount", "recommendCount", "videoCount", "imageCount"]:
                if int_field not in result_dict:
                    result_dict[int_field] = 0
            result_dict["siteName"] = document_data.domain_data.name
            result_dict["siteIdentifier"] = document_data.domain_data.url
            # One "link<TAB>text" line per body link.  "bodyLinks" is read
            # unconditionally, so a missing key raises KeyError here.
            outLinks = list()
            for link in result_dict["bodyLinks"]:
                l_data = result_dict["bodyLinks"][link]
                outLinks.append("%s\t%s" % (link, l_data.text))
            result_dict["outLinks"] = "\n".join(outLinks)
Пример #11
0
    def sendData(self, data, sc="NEWS"):
        try:

            if sc != "NEWS":
                return "NO SERVICE"

            res = self.producer.send_messages(sc, data)
            if str(res).find("error=0") >= 0:
                return "OK"
            else:
                getLogger().error(str(res))
                return "ERROR"
        except Exception, msg:
            getLogger().error(msg)
            return "ERROR"
Пример #12
0
	def __init__(self, _outputDirPath=os.getcwd(), _documentCountLimit=1000):
		"""Store the output directory and document limit, then run setup.

		NOTE(review): the ``os.getcwd()`` default is evaluated once, when the
		method is defined, not on each call -- confirm that is intended.
		"""
		self.fieldListDic = dict()
		self.outputDirPath = _outputDirPath
		self.documentCountLimit = _documentCountLimit
		self.logger = getLogger()
		# Both helpers run after the attributes above have been assigned.
		self.setupOutputDir()
		self.initSCFieldListDic()
Пример #13
0
    def __init__(self):
        """Record the current version, then call load_release() and do_warnings()."""
        self.log = getLogger("Updates")
        # `current` is not defined in this method; it must come from an
        # enclosing (module) scope.
        self.current = current
        self.current_v = StrictVersion(current)

        self.load_release()
        self.do_warnings()
Пример #14
0
 def __init__(self, _outputDirPath=os.getcwd(), _documentCountLimit=1000):
     """Store the output directory and document limit, then run setup.

     NOTE(review): the ``os.getcwd()`` default is evaluated once, when the
     method is defined, not on each call -- confirm that is intended.
     """
     self.fieldListDic = dict()
     self.outputDirPath = _outputDirPath
     self.documentCountLimit = _documentCountLimit
     self.logger = getLogger()
     # Both helpers run after the attributes above have been assigned.
     self.setupOutputDir()
     self.initSCFieldListDic()
Пример #15
0
	def _getRules(self, url, verbose=False):
		"""
		Returns the RobotTextRules object for url(site-level or dir-level)
		First:  use internal cache
		Second: use memcache
		Third:  download robots.txt and parsing 
		"""
		logger = log.getLogger()

		# 1. use stored robots dictionary cache
		robots_site_path = urlparse.urljoin(url, "/robots.txt")	# Then the site-level
		if robots_site_path in self.robots:
			if verbose:
				logger.info("robotstxt in local memory: %s", robots_site_path)
			return self.robots[robots_site_path]
		
		# 2. use memcache
		rules = None
		try:
			# 3. download robots text
			rules = self._parsingRobotsFile(robots_site_path)	# First try site-level
			if verbose:
				logger.info("robotstxt downloaded: %s: %s", rules.return_code, robots_site_path)
			self.robots[robots_site_path] = rules

		except:
			pass

		return rules
Пример #16
0
def isOldImage(chk_api, hash_key):
	"""Query ``chk_api`` with curl and look for ``hash_key`` in the output.

	Returns True as soon as an output line contains the upper-cased
	``hash_key``.  A line containing "404" clears ``exist`` and stops the
	scan.  Exceptions are logged.

	NOTE(review): the excerpt ends here; how ``exist`` is used and what is
	returned on the other paths is not visible.
	"""

	try:
		# NOTE(review): chk_api is interpolated into a shell command line
		# (shell=True); only safe when it comes from a trusted source.
		cmd = "curl %s --connect-timeout 5 --max-time 10 "%chk_api
		exist = True
		fd = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		for line in fd.stdout.readlines():
			results = line.strip()
			if results.find("404") >= 0:
				exist = False
				break
			if results.find(hash_key.upper()) >= 0:
				getLogger().info("same hash")
				return True
	except Exception, msg:
		getLogger().error(msg)
Пример #17
0
	def parse(self, header, html, url, parser_id=None):

		if self.prm == None:
			self.setRules()

		ret_dict = self.parser.plugParser(header,  html, url)

		result_dict = dict()
		if parser_id != None and parser_id in self.prm.id_dict:
			try:
				host_rule = self.prm.id_dict[parser_id]
				result_dict = self.getDataByRule(host_rule, ret_dict, url)
				result_dict["parser_id"] = parser_id
				return result_dict
			except Exception, msg:
				getLogger().error(msg)
Пример #18
0
def uploadImage(upload_url, file):
	"""Upload ``file`` to ``upload_url`` + "/fileext/jpg" with curl, up to 3 tries.

	Returns "OK" as soon as an output line starts with "OK"; the time
	elapsed since the function started is logged.  An exception during
	an attempt is logged and counts as a failed try.

	NOTE(review): the excerpt ends inside the retry loop; the value
	returned after three failed attempts is not visible here.
	"""
	retry = 0
	results = ""
	m_t = time.time()
	while retry < 3:
		try:
			# NOTE(review): both arguments are interpolated into a shell
			# command line (shell=True); only safe for trusted values.
			cmd = "curl --upload-file %s %s --connect-timeout 5 --max-time 10 --header 'Expect:' "%(file, upload_url+"/fileext/jpg")
			fd = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			for line in fd.stdout.readlines():
				results = line.strip()
				if results.startswith("OK"):
					e_t = time.time()
					getLogger().info("%s upload time :  %s", upload_url, e_t - m_t)
					return "OK"
		except Exception, msg:
			getLogger().error(msg)
		retry += 1
Пример #19
0
def downloadImage(img_url):
	rq = mechanize.Request(img_url)
	try:
		rs = mechanize.urlopen(rq)
		http_content = rs.read()
		header = rs.info()
		return http_content
	except Exception, msg:
		try:
			getLogger().error("%s %s", img_url, msg)
			time.sleep(1)
			rs = mechanize.urlopen(rq)
			http_content = rs.read()
			return http_content
		except Exception, msg:
			getLogger().error("%s %s", img_url, msg)
			return str(msg)
Пример #20
0
 def __init__(self):
     """Set up the logger and register the two generic help topics."""
     self.logger = getLogger("Help")
     # Topic listing aliases.
     self.add_topic(
         AliasListTopic("aliases", HelpTopicType.GenericTopic)
     )
     # Topic listing commands ("ComandListTopic" is the class name as spelled).
     self.add_topic(
         ComandListTopic("commands", HelpTopicType.GenericTopic)
     )
Пример #21
0
    def __init__(self, filename):
        """Remember ``filename`` and load it through ``reload(False)``.

        ``self.exists`` is set to whatever ``reload(False)`` returns.
        """
        self.callbacks = []

        self.logger = getLogger("YamlConfig")
        # Some sanitizing here to make sure people can't escape the config dirs
        # NOTE(review): str.strip("..") only removes "." characters from the
        # two ends of the string; a ".." segment inside the path is kept.
        filename = filename.strip("..")
        self.filename = filename
        self.exists = self.reload(False)
Пример #22
0
    def __init__(self, filename):
        """Remember ``filename`` and load it through ``reload(False)``.

        ``self.exists`` is set to whatever ``reload(False)`` returns.
        """
        self.callbacks = []

        self.logger = getLogger("YamlConfig")
        # Some sanitizing here to make sure people can't escape the config dirs
        # NOTE(review): str.strip("..") only removes "." characters from the
        # two ends of the string; a ".." segment inside the path is kept.
        filename = filename.strip("..")
        self.filename = filename
        self.exists = self.reload(False)
Пример #23
0
    def __init__(self, factory_manager=None,
                 path="./plugins", module="plugins"):
        """Store the factory manager and the plugin location.

        Raises ValueError when ``factory_manager`` is None; the parameter
        only has a default so it can be passed by keyword.
        """
        if factory_manager is None:
            raise ValueError("Factory manager cannot be None!")

        self.log = getLogger("Plugins")

        self.factory_manager = factory_manager

        self.module = module
        self.path = path
Пример #24
0
def makeUrlFromPattern(patterns, ret_key_dict):
	"""Build a URL from the first pattern whose placeholders all resolve.

	Each pattern contains "(KEY)" placeholders that are replaced with the
	values in ``ret_key_dict``.  The first pattern left without any "(" or
	")" is quoted and returned.  Exceptions are logged.  When no pattern
	resolves, nothing is returned in the visible code (implicit None).
	"""
	try:
		for ret_url in patterns:
			anypath_str = ""
			for kk in ret_key_dict:
				# A key that must be an integer but is not stops substitution
				# for this pattern.  NOTE(review): the completeness check below
				# still runs after this break -- confirm that is intended.
				if (kk in INT_KEY or kk.startswith("INT_")) and  not isInt(ret_key_dict[kk]) :
					break
				if kk == "ANYPATH" and ret_key_dict[kk].strip() == "":
					# An empty ANYPATH collapses "/(ANYPATH)/" to a single slash.
					ret_url = ret_url.replace("/(ANYPATH)/", "/")
				else:
					ret_url = ret_url.replace("("+kk+")", ret_key_dict[kk])
					if kk == "ANYPATH":
						anypath_str = "/"+ret_key_dict[kk]

			if ret_url.count("(") == 0 and ret_url.count(")") == 0:
				ret_url = quote(ret_url.strip(), safe=RESERVED)
				# Normalise "m.www." hosts and drop the ANYPATH segment again.
				ret_url = ret_url.replace("://m.www.","://m.").replace(anypath_str, "")
				return ret_url
	except Exception, msg:
		getLogger().error(msg)
Пример #25
0
    def writeEachDocumentData(self, f, dataDic, type):

        try:
            if type == "test":
                f.write("%s\t%s\t%s\n" % (dataDic["guid"], dataDic["title"], dataDic["bodyHtml"]))
            else:
                keyList = None
                if "type" in dataDic:
                    scType = dataDic["type"]
                    if scType in self.fieldListDic:
                        keyList = self.fieldListDic[scType]

                if keyList:  # 필드명 지정
                    for field in keyList:
                        if field in dataDic and dataDic[field] and dataDic[field] != "":
                            f.write("		<%s><![CDATA[%s]]></%s>\n" % (field, dataDic[field], field))
                else:  # 필드명 미지정(모두출력)
                    for key, val in dataDic.items():
                        f.write("		<%s><![CDATA[%s]]></%s>\n" % (key, val, key))
        except Exception, msg:
            getLogger().error("%s %s FILE WRITE ERROR", msg, dataDic)
Пример #26
0
    def __init__(self, name, factory, config):
        """Initialise the base protocol, then start a TCP connection.

        Host and port are read from ``self.config["connection"]``.
        """
        NoChannelsProtocol.__init__(self, name, factory, config)

        self.log = getLogger(self.name)
        self.event_manager = EventManager()
        self.command_manager = CommandManager()

        # Fourth positional argument is presumably the connect timeout in
        # seconds (Twisted's connectTCP signature) -- TODO confirm.
        reactor.connectTCP(
            self.config["connection"]["host"],
            self.config["connection"]["port"],
            self.factory,
            120
        )
Пример #27
0
    def __init__(self,
                 factory_manager=None,
                 path="./plugins",
                 module="plugins"):
        """Store the factory manager and the plugin location.

        Raises ValueError when ``factory_manager`` is None; the parameter
        only has a default so it can be passed by keyword.
        """
        if factory_manager is None:
            raise ValueError("Factory manager cannot be None!")

        self.log = getLogger("Plugins")

        self.factory_manager = factory_manager

        self.module = module
        self.path = path
Пример #28
0
	def __init__(self, init_dic):
		"""Validate ``init_dic`` and prepare empty pattern containers.

		When ``isValidInfo(init_dic)`` is false an error is logged and the
		process exits with status 1.
		"""
		self.info_dic = dict()
		self.logger = getLogger()
		if self.isValidInfo(init_dic):
			self.info_dic = init_dic
		else:
			self.logger.error("Invalid init information")
			exit(1)
		self.url_parser = URLParser()  # the most basic form of URL parser

		self.pattern_dic = {"normal":dict(), "host_key":dict()}  # dictionary that will hold all pattern information
		# dic["normal" | "hostkey"][domain] = [(priority, URLData()), ...]

		self.build_pt_dic = dict()  # the extracted reserved words can be used to build various other URLs
Пример #29
0
 def __init__(self, protocol_name, config, manager):
     """Store the protocol settings and the manager's reconnection policy.

     The protocol type comes from ``config["main"]["protocol-type"]``; the
     reconnection options come from the "reconnections" section of
     ``manager.main_config``.
     """
     self.logger = getLogger("F: %s" % protocol_name)

     # Identity and configuration handed in by the caller.
     self.config = config
     self.manager = manager
     self.name = protocol_name
     self.ptype = config["main"]["protocol-type"]

     # No protocol class or instance yet.
     self.protocol_class = None
     self.protocol = None

     # Reconnection policy; delay and attempts are coerced to int.
     policy = manager.main_config["reconnections"]
     self.r_delay = int(policy["delay"])
     self.r_attempts = int(policy["attempts"])
     self.r_on_drop = policy["on-drop"]
     self.r_on_failure = policy["on-failure"]
     self.r_reset = policy["reset-on-success"]
Пример #30
0
 def __init__(self, protocol_name, config, manager):
     """Store the protocol settings and the manager's reconnection policy.

     The protocol type comes from ``config["main"]["protocol-type"]``; the
     reconnection options come from the "reconnections" section of
     ``manager.main_config``.
     """
     self.logger = getLogger("F: %s" % protocol_name)

     # Identity and configuration handed in by the caller.
     self.config = config
     self.manager = manager
     self.name = protocol_name
     self.ptype = config["main"]["protocol-type"]

     # No protocol class or instance yet.
     self.protocol_class = None
     self.protocol = None

     # Reconnection policy; delay and attempts are coerced to int.
     policy = manager.main_config["reconnections"]
     self.r_delay = int(policy["delay"])
     self.r_attempts = int(policy["attempts"])
     self.r_on_drop = policy["on-drop"]
     self.r_on_failure = policy["on-failure"]
     self.r_reset = policy["reset-on-success"]
Пример #31
0
    def __init__(self, filename):
        """Prepare the data file ``filename`` and load it via ``reload(False)``.

        The directory part of ``filename`` (everything before the last "/")
        is created when it does not exist yet.
        """
        self.callbacks = []

        self.logger = getLogger("Data")
        # NOTE(review): str.strip("..") only removes "." characters from the
        # two ends of the string; a ".." segment inside the path is kept.
        filename = filename.strip("..")

        # Directory part of the path: everything before the last "/".
        folders = "/".join(filename.split("/")[:-1])

        # A bare filename has no directory part, and os.makedirs("") raises,
        # so only create directories when there is something to create.
        if folders and not os.path.exists(folders):
            os.makedirs(folders)

        self.filename = filename
        self.reload(False)
Пример #32
0
    def __init__(self, filename):
        """Prepare the data file ``filename`` and load it via ``reload(False)``.

        The directory part of ``filename`` (everything before the last "/")
        is created when it does not exist yet.
        """
        self.callbacks = []

        self.logger = getLogger("Data")
        # NOTE(review): str.strip("..") only removes "." characters from the
        # two ends of the string; a ".." segment inside the path is kept.
        filename = filename.strip("..")

        # Directory part of the path: everything before the last "/".
        folders = "/".join(filename.split("/")[:-1])

        # A bare filename has no directory part, and os.makedirs("") raises,
        # so only create directories when there is something to create.
        if folders and not os.path.exists(folders):
            os.makedirs(folders)

        self.filename = filename
        self.reload(False)
Пример #33
0
    def __init__(self, path, *args, **kwargs):
        """Store the connection parameters and call ``reconnect()``.

        ``path``, the positional ``args`` and the keyword ``kwargs`` are
        kept on the instance; ``url`` is read from ``kwargs`` (None when
        absent).
        """
        self.callbacks = []

        self.logger = getLogger("Redis")

        self.path = path
        self.url = kwargs.get("url", None)

        # Trace the inputs; empty args/kwargs are shown as "[]" / "{}".
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        self.args = args
        self.kwargs = kwargs

        self.reconnect()
Пример #34
0
	def isDisallowSite(self, url, verbose=False):
		"""
		Report whether the site's robots.txt blocks crawling entirely.

		The result is True when robots.txt contains the following rule:
			User-agent: * or zumbot
			Disallow: /

		Returns a ``(blocked, delay)`` tuple.  ``self.delay`` is reset to 3 and
		``self.blocked`` to False before ``_parsingRobotsFile`` runs; that
		helper is presumably what updates them -- TODO confirm.
		``verbose`` is accepted but not used here.
		"""
		# The logger is requested as before, although nothing is logged here.
		log.getLogger()

		self.delay = 3
		robots_url = urlparse.urljoin(url, "/robots.txt")	# site-level robots.txt

		# Download and parse robots.txt; the returned rules are not needed here.
		self.blocked = False
		self._parsingRobotsFile(robots_url)

		return bool(self.blocked), self.delay
Пример #35
0
    def __init__(self, path, *args, **kwargs):
        """Store the connection parameters and call ``reconnect()``.

        ``path``, the positional ``args`` and the keyword ``kwargs`` are
        kept on the instance; ``url`` is read from ``kwargs`` (None when
        absent).
        """
        self.callbacks = []

        self.logger = getLogger("Redis")

        self.path = path
        self.url = kwargs.get("url", None)

        # Trace the inputs; empty args/kwargs are shown as "[]" / "{}".
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        self.args = args
        self.kwargs = kwargs

        self.reconnect()
Пример #36
0
    def __init__(self):
        """Replace sys.stdout / sys.stderr with console wrappers.

        Nothing is wrapped when stdout is not a TTY or "--no-console" is on
        the command line; ``self.wrapped`` is then False and no other
        attribute is set.

        NOTE(review): ``self.wrapped`` is not assigned on the wrapping path
        in this excerpt -- presumably that follows below; confirm.
        """
        if not sys.stdout.isatty() or "--no-console" in sys.argv:
            self.wrapped = False
            return

        self.logger = getLogger("Console")

        # Keep the original streams so they remain reachable after the swap.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        self.wrapper = Wrapper(self)
        self.wrapper_err = WrapperErr(self)
        self.reader = Reader(self)

        # We set it here
        sys.stdout = self.wrapper
        sys.stderr = self.wrapper_err
Пример #37
0
    def __init__(self):
        """Replace sys.stdout / sys.stderr with console wrappers.

        Nothing is wrapped when stdout is not a TTY or "--no-console" is on
        the command line; ``self.wrapped`` is then False and no other
        attribute is set.

        NOTE(review): ``self.wrapped`` is not assigned on the wrapping path
        in this excerpt -- presumably that follows below; confirm.
        """
        if not sys.stdout.isatty() or "--no-console" in sys.argv:
            self.wrapped = False
            return

        self.logger = getLogger("Console")

        # Keep the original streams so they remain reachable after the swap.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        self.wrapper = Wrapper(self)
        self.wrapper_err = WrapperErr(self)
        self.reader = Reader(self)

        # We set it here
        sys.stdout = self.wrapper
        sys.stderr = self.wrapper_err
Пример #38
0
    def __init__(self, name, factory, config):
        """Read the protocol configuration and start an SSL connection.

        Fires the "PreConnect" callback before connecting and "PostConnect"
        after ``reactor.connectSSL`` has been called.  When no client
        context can be created the protocol is removed from the factory's
        manager and no connection is attempted.
        """
        self.name = name
        self.factory = factory
        self.config = config

        self.received = ""
        self.log = getLogger(self.name)
        self.log.info("Setting up..")

        self.command_manager = CommandManager()
        self.event_manager = EventManager()

        # Required settings: a missing key raises KeyError.
        self.username = config["identity"]["username"]
        self.password = config["identity"]["password"]
        self.networking = config["network"]
        self.tokens = config["identity"]["tokens"]

        self.control_chars = config["control_chars"]

        # Optional "audio" section; both flags default to True.
        audio_conf = config.get("audio", {})
        self.should_mute_self = audio_conf.get("should_mute_self", True)
        self.should_deafen_self = audio_conf.get("should_deafen_self", True)

        event = general_events.PreConnectEvent(self, config)
        self.event_manager.run_callback("PreConnect", event)

        context = self._get_client_context()
        if context is None:
            # Could not create a context (problem loading cert file)
            self.factory.manager.remove_protocol(self.name)
            return

        # Last positional argument is presumably the connect timeout in
        # seconds (Twisted's connectSSL signature) -- TODO confirm.
        reactor.connectSSL(
            self.networking["address"],
            self.networking["port"],
            self.factory,
            context,
            120
        )

        event = general_events.PostConnectEvent(self, config)
        self.event_manager.run_callback("PostConnect", event)
Пример #39
0
    def __init__(self, factory, config):
        """Read the server and identity settings, then start a TCP connection."""
        self.factory = factory
        self.config = config
        self.log = getLogger("TS3")

        self.log.info("Setting up..")

        # Required config sections: a missing key raises KeyError.
        self.server = config["server"]
        self.identity = config["identity"]

        self.user = self.identity["username"]
        self.passw = self.identity["password"]
        self.sid = self.server["sid"]

        # Fourth positional argument is presumably the connect timeout in
        # seconds (Twisted's connectTCP signature) -- TODO confirm.
        reactor.connectTCP(
            self.server["address"],
            self.server["port"],
            self.factory,
            120
        )
Пример #40
0
    def __init__(self, path, *args, **kwargs):
        """Parse ``path``, store the connection arguments and call ``reconnect()``.

        ``path`` must contain at least one "/": everything up to and
        including the first one is dropped (IndexError otherwise).  The text
        before the first ":" of the remainder becomes ``parsed_module``.
        """
        self.callbacks = []

        self.logger = getLogger("DBAPI")

        # Collapse "//" to "/", then keep only what follows the first "/".
        path = path.replace("//", "/")
        path = path.split("/", 1)[1]

        self.path = path

        # Trace the inputs; empty args/kwargs are shown as "[]" / "{}".
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        parsed_module = path.split(":", 1)[0]
        self.parsed_module = parsed_module
        self.args = args
        self.kwargs = kwargs

        self.logger.debug(_("Parsed module: %s") % parsed_module)

        self.reconnect()
Пример #41
0
    def __init__(self, path, *args, **kwargs):
        """Parse ``path``, store the connection arguments and call ``reconnect()``.

        ``path`` must contain at least one "/": everything up to and
        including the first one is dropped (IndexError otherwise).  The text
        before the first ":" of the remainder becomes ``parsed_module``.
        """
        self.callbacks = []

        self.logger = getLogger("DBAPI")

        # Collapse "//" to "/", then keep only what follows the first "/".
        path = path.replace("//", "/")
        path = path.split("/", 1)[1]

        self.path = path

        # Trace the inputs; empty args/kwargs are shown as "[]" / "{}".
        self.logger.trace("Path: %s" % path)
        self.logger.trace("Args: %s" % (args or "[]"))
        self.logger.trace("KWArgs: %s" % (kwargs or "{}"))

        parsed_module = path.split(":", 1)[0]
        self.parsed_module = parsed_module
        self.args = args
        self.kwargs = kwargs

        self.logger.debug(_("Parsed module: %s") % parsed_module)

        self.reconnect()
Пример #42
0
    def set_language(self, lang=None, mlang=None):
        if lang is None:
            lang = DEFAULT

        if mlang is None:
            mlang = DEFAULT

        self.get_known()

        if self.log and self.logger is None:
            from utils.log import getLogger

            self.logger = getLogger("Translations")

        if lang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (lang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'" %
                                 (lang, DEFAULT))

            lang = DEFAULT

        if mlang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (mlang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'" %
                                 (mlang, DEFAULT))

            mlang = DEFAULT

        self.language = lang
        self.m_language = mlang
        self.reload()
Пример #43
0
    def set_language(self, lang=None, mlang=None):
        if lang is None:
            lang = DEFAULT

        if mlang is None:
            mlang = DEFAULT

        self.get_known()

        if self.log and self.logger is None:
            from utils.log import getLogger

            self.logger = getLogger("Translations")

        if lang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (lang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'"
                                 % (lang, DEFAULT))

            lang = DEFAULT

        if mlang not in self.known:
            if self.logger is None:
                print "Unknown language '%s', defaulting to '%s'" \
                      % (mlang, DEFAULT)
            else:
                self.logger.warn("Unknown language '%s', defaulting to '%s'"
                                 % (mlang, DEFAULT))

            mlang = DEFAULT

        self.language = lang
        self.m_language = mlang
        self.reload()
Пример #44
0
    def __init__(self):
        """Set up config/data directories under ``tmpdir`` and a permissions handler.

        Creates ``<tmpdir>/config/`` and ``<tmpdir>/data/`` when missing,
        writes a minimal ``settings.yml``, opens ``permissions.yml`` through
        a StorageManager and wraps it in a ``permissionsHandler``.
        """
        self.logger = getLogger("Permissions")

        self.confdir = tmpdir + "/config/"
        self.datadir = tmpdir + "/data/"

        # Create each directory on its own: with a single try block, an
        # already-existing config dir would stop the data dir being created.
        created = False
        for directory in (self.confdir, self.datadir):
            try:
                os.makedirs(directory)
                created = True
            except OSError:
                # Most likely the directory already exists; carry on.
                pass
        if created:
            self.logger.debug("Config and data dirs created.")

        # Close the settings file deterministically so the content is flushed
        # before StorageManager reads the directory.
        with open(self.confdir + "settings.yml", "w") as settings_file:
            yaml.dump({"editor_warning": False}, settings_file)

        self.storage = StorageManager(self.confdir, self.datadir)

        self.data = self.storage.get_file(self, "data", formats.YAML,
                                          "permissions.yml")

        self.handler = permissionsHandler(self, self.data)

        super(PluginObject, self).__init__()
Пример #45
0
import logging
import os
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
import zerorpc
from sklearn.linear_model import SGDClassifier
from sqlalchemy import update

from model.db import DB_ENGINE, rawcontents
from utils.log import getLogger

# Module-level logger, named 'semiTrain'.
logger = getLogger('semiTrain')


def fetchAllData(threshold):
    """Load rid, tag and assure for rows whose content is longer than ``threshold``.

    ``threshold`` is formatted directly into the SQL text, so it must be a
    trusted value.  Returns the DataFrame produced by ``pd.read_sql``.
    """
    query_template = 'SELECT rid, tag, assure FROM rawcontents WHERE LENGTH(content) > {}'
    return pd.read_sql(query_template.format(threshold), DB_ENGINE)


def randomSelectData(data, count):
    """Return a copy of up to ``count`` rows of ``data`` in random order.

    Row positions are shuffled with NumPy's global random state and the
    first ``count`` of them are selected, so no row is picked twice.
    """
    positions = np.arange(len(data))
    np.random.shuffle(positions)
    chosen = positions[:count]
    return data.iloc[chosen].copy()


def completeTrainData(current):
    # Query template; "{}" is the placeholder for the rid list of the IN clause.
    # NOTE(review): the rest of this function is not visible in this excerpt.
    stmt = 'SELECT rid, content, vector FROM rawcontents WHERE rid IN {}'
Пример #46
0
    def __init__(self, data_dict):
        """Wrap ``data_dict``; the object is stored as-is, not copied."""
        self.callbacks = []

        self.logger = getLogger("Data")
        self.data = data_dict
Пример #47
0
    def __init__(self):
        """Import ``tty`` and ``sys`` and trace-log that they loaded.

        The imports raise ImportError where ``tty`` is unavailable.
        """
        self.log = getLogger("GetchUnix")
        import tty
        import sys

        self.log.trace(_("Loaded: %s, %s") % (tty, sys))
Пример #48
0
from collections import namedtuple

from sqlalchemy import MetaData, create_engine
from sqlalchemy.schema import (Column, ForeignKey, ForeignKeyConstraint, Index,
                               PrimaryKeyConstraint, Table, UniqueConstraint)
from sqlalchemy.types import BLOB, Boolean, DateTime, Float, Integer, String

from settings import DB_ENGINE_FILE, ECHO_DATABASE_INFO
from utils.log import getLogger

# Module-level logger, named 'db'.
logger = getLogger('db')

# models

# SQLite engine on the file named by DB_ENGINE_FILE; ECHO_DATABASE_INFO is
# passed as SQLAlchemy's `echo` flag.
DB_ENGINE = create_engine('sqlite:///{}'.format(DB_ENGINE_FILE),
                          echo=ECHO_DATABASE_INFO)

# Shared MetaData object that the tables below are registered on.
metadata = MetaData()

# 'posts' table.  NOTE(review): the definition is cut off in this excerpt.
posts = Table(
    'posts', metadata, Column('pid', Integer, autoincrement=True),
    Column('blockid', String, nullable=False, comment='板块id'),
    Column('postid', Integer, nullable=False, comment='帖子id'),
    Column('title', String, nullable=False, comment='帖子标题'),
    Column('pageurl', String, nullable=False, comment='帖子首页url'),
    Column('subType', String, comment='帖子子类型'),
    Column('activityuserid', Integer, nullable=False, comment='楼主id'),
    Column('clickcount', Integer, nullable=False, comment='点击数'),
    Column('replycount', Integer, nullable=False, comment='回复数'),
    Column('remarkcount', Integer, nullable=False, comment='楼主发言数'),
    Column('imgcount', Integer, nullable=False, comment='图片数'),
Пример #49
0
    def __init__(self):
        """Create the logger and compile the three token-handling patterns."""
        self.logger = getLogger("Tokens")

        # "{...}": an opening brace up to the next closing brace.
        self.token_regex = re.compile(r"\{[^}]*\}")
        # A ":" that is not preceded by a backslash.
        self.parse_regex = re.compile(r"(?<!\\):")
        # A backslash-escaped colon, "\:".
        self.escape_regex = re.compile(r"\\:")
Пример #50
0
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from sqlalchemy import select

from model.db import DB_ENGINE, posts, rawcontents
from utils.log import getLogger

# Module-level logger, named 'doc2vec'.
logger = getLogger('doc2vec')

# init

# Every selected row (a single title column) becomes one TaggedDocument
# tagged with its enumeration index.
# NOTE(review): `doc` is the whole result row, not a tokenised word list --
# confirm this is the intended input for Doc2Vec.
with DB_ENGINE.connect() as conn:
    s = select([posts.c.title])
    documents = [
        TaggedDocument(doc, [pid]) for pid, doc in enumerate(conn.execute(s))
    ]

# Train on the collected documents.
model = Doc2Vec(documents, vector_size=2048, window=5, min_count=1, workers=4)

# Save the model, then load it again from the same path.
model.save('RuntimeTY/d2v_2048_5_1216')

model = Doc2Vec.load('RuntimeTY/d2v_2048_5_1216')

logger.critical('Loaded')

# Iterate over every rawcontents row.
# NOTE(review): the loop body is cut off in this excerpt.
with DB_ENGINE.connect() as conn:
    s = select([rawcontents])
    buffer = []

    for row in conn.execute(s):
        rid = row[rawcontents.c.rid]
Пример #51
0
 def __init__(self):
     """Set up the logger and register the two generic help topics."""
     self.logger = getLogger("Help")
     # One topic listing aliases, one listing commands ("ComandListTopic" is
     # the class name as spelled).
     self.add_topic(AliasListTopic("aliases", HelpTopicType.GenericTopic))
     self.add_topic(ComandListTopic("commands", HelpTopicType.GenericTopic))
Пример #52
0
    def __init__(self):
        """Import ``msvcrt`` and trace-log that it loaded.

        The import raises ImportError where ``msvcrt`` is unavailable.
        """
        self.log = getLogger("GetchWindows")
        import msvcrt

        self.log.trace(_("Loaded: %s") % msvcrt)
Пример #53
0
import logging
import re
from urllib import parse

import demjson

from model.db import Reply
from utils.datetime import parseDatetimeString
from utils.log import getLogger
from utils.request import getSoup, with_max_retries

# Module-level logger named 'parser'; logging.INFO is passed as the second
# argument (presumably the level -- see utils.log.getLogger).
logger = getLogger('parser', logging.INFO)


class FetchPostFailed(BaseException):
    """Error type for post-fetch failures.

    It derives from BaseException, so a plain ``except Exception`` handler
    does not catch it.
    """


# app


@with_max_retries(3, 10)
def extractAll(blockid, postid):
    """Fetch page 1 of a post and start extracting its data.

    Returns None when the page could not be fetched (``getPage`` returned
    None).

    NOTE(review): the rest of the function is cut off in this excerpt.
    """
    url = urlFactory(blockid, postid, 1)
    soup = getPage(url)
    if soup == None:
        return None

    bbsGlobal = extractBBSGlobal(soup)
    # Special case: Q&A ("isWenda") posts and posts whose subtype is '本版隐藏'.
    if bbsGlobal['isWenda'] or bbsGlobal['subType'] == '本版隐藏':
        logger.info('Got subtype with {} in {}'.format(bbsGlobal['subType'],
Пример #54
0
from utils.log import getLogger
import pandas as pd
from model.db import DB_ENGINE
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pickle
import os
from sklearn import ensemble, svm

# Module-level logger named 'predict' (created by the project's getLogger).
logger = getLogger('predict')

# Load the labelled rows: only records whose `assure` value exceeds 0.5.
labeled = pd.read_sql(
    'SELECT rid, tag, vector FROM rawcontents WHERE assure>0.5', DB_ENGINE)

# Collapse rows that share the same serialized vector into one sample whose
# target is the mean of their tags.
# NOTE(review): a mean of tags can be non-integer, while SVC is a classifier -
# confirm the resulting targets are valid class labels.
train = labeled.groupby('vector')['tag'].mean()

# The group keys are the pickled vectors; deserialize them into features.
# NOTE(review): pickle.loads executes arbitrary code from its input - only
# safe if the `vector` column is fully trusted.
X = list(pd.Series(train.index).apply(pickle.loads))
y = list(train.values)

logger.info('Build the model')

clf = svm.SVC(kernel='rbf')

logger.info('Fitting...')

clf.fit(X, y)

# Persist the fitted classifier with pickle.
# NOTE(review): DATA_ROOT is neither defined nor imported in the visible code,
# so this line raises NameError as written. The `/` operator suggests it is
# meant to be a pathlib.Path - confirm where it should come from.
with open(DATA_ROOT / 'SVC_rbf_model.bin', 'wb') as f:
    pickle.dump(clf, f)
Пример #55
0
import pymysql
import configparser
import logging
from argon2 import PasswordHasher
from classes.User import User
from classes.PSAlbum import PSAlbum
from utils.log import getConsoleHandler, getFileHandler, getLogger
import time


# Module-level logger for this module. The second argument looks like a
# log-file path handed to the project's getLogger helper - presumably where
# records are written; confirm against utils.log.
psLogger = getLogger(__name__, "logs/photoshare.log")
psLogger.debug("Loading DBConnection class")

class dbConnection:
    """Holds the SQL connection settings read from a configuration object."""

    # Class-level defaults. __init__ overrides every one of these on the
    # instance except SQL_CONNECTION, which stays an empty string.
    USERNAME = ''
    PASSWORD = ''
    HOST = ''
    DATABASE_NAME = ''
    CHARSET = ''
    SQL_CONNECTION = ''

    def __init__(self, settings):
        """Copy the ``[SQL]`` options from *settings* onto the instance.

        :param settings: Object exposing ``get(section, option)``; the
            options read are host, user, password, dbName and charset.
        """
        # (instance attribute, option name) pairs, in the order they are read.
        sql_options = (
            ('HOST', 'host'),
            ('USERNAME', 'user'),
            ('PASSWORD', 'password'),
            ('DATABASE_NAME', 'dbName'),
            ('CHARSET', 'charset'),
        )
        for attribute, option in sql_options:
            setattr(self, attribute, settings.get('SQL', option))

Пример #56
0
from utils.log import getLogger, logging
from utils.request import fetchJson, with_max_retries
import json

# Module-level logger named 'reward'. logging.INFO is passed as the second
# argument to the project's getLogger - presumably the log level; confirm
# against utils.log.
logger = getLogger('reward', logging.INFO)


class FetchRewardInfoFailed(BaseException):
    """Error type used by this module to signal that fetching reward info failed."""
    # NOTE(review): this derives from BaseException, so `except Exception`
    # handlers will NOT catch it - confirm that is intentional.


# app


def fetchRewardInfo(bbsGlobal):
    """Assemble the request form for ``bbs.api.getArticleDashangInfo``.

    Reads ``item``, ``artId`` and the three id-list entries from *bbsGlobal*.
    Each id list is copied into the form unchanged when its first element is
    ``'0'``; otherwise it is prefixed with ``'0,'``.

    The visible body only builds the form; it neither sends nor returns it.
    """
    form = {
        'method': 'bbs.api.getArticleDashangInfo',
        'params.item': bbsGlobal['item'],
        'params.articleId': bbsGlobal['artId'],
    }

    # (form field, bbsGlobal key) pairs, processed in this order.
    id_fields = (
        ('params.rewardIds', 'tyfen_rewardIds'),
        ('params.tyfIds', 'tyfen_tyfIds'),
        ('params.shangIds', 'shangIds'),
    )
    for form_key, global_key in id_fields:
        ids = bbsGlobal[global_key]
        starts_with_zero = len(ids) > 0 and ids[0] == '0'
        form[form_key] = ids if starts_with_zero else '0,' + ids
Пример #57
0
    def __init__(self, data_dict):
        """Set up the object around an in-memory data mapping.

        :param data_dict: Stored as-is on ``self.data``; no copy is made.
        """
        # Starts empty; nothing in this method registers a callback.
        self.callbacks = []

        self.logger = getLogger("MemoryConfig")
        # Unconditionally marked as existing.
        self.exists = True
        self.data = data_dict
Пример #58
0
Author: Hai Liang Wang <*****@*****.**>
'''

import os
import tensorflow as tf
import shutil
from config import Config
from tqdm import tqdm
from utils import log
from munch import munchify
from models.rnn import Model
from dataset.textdata import TextData
from time import localtime, strftime

# Module-level configuration object and logger, created at import time.
config = Config()
logger = log.getLogger(__name__)


def main(unused_argv):
    """Build the training ``TextData`` from the module-level ``config``.

    *unused_argv* is not read by the visible body.
    """
    # Options passed to TextData (wrapped by munchify); paths, corpus name,
    # length limits and batch size all come from `config`.
    text_data_options = {
        'rootDir': config.root_dir,
        'corpus': config.corpus_name,
        'maxLength': config.train_max_length,
        'maxLengthEnco': config.train_max_length_enco,
        'maxLengthDeco': config.train_max_length_deco,
        'datasetTag': '',
        'test': False,
        'watsonMode': False,
        'batchSize': config.train_num_batch_size,
    }
    batch_data = TextData(munchify(text_data_options))
Пример #59
0
import json
import logging
import time
from functools import wraps

import requests
from bs4 import BeautifulSoup

from utils.log import getLogger

# Module-level logger named 'request'. logging.INFO is passed as the second
# argument to the project's getLogger - presumably the log level; confirm
# against utils.log.
logger = getLogger('request', logging.INFO)

# A single requests.Session created at import time, with a desktop Chrome
# User-Agent string set on its default headers.
session = requests.Session()
session.headers[
    'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

# functions


def lauchRequest(req_func, ret_func, sleep_time):
    """Wrap *req_func* so every call sleeps first and post-processes the response.

    The returned callable accepts the positional arguments of *req_func* plus
    an optional keyword ``sleep_time`` that overrides the default delay for
    that one call. It returns a ``(ret_func(response), response)`` tuple.
    """
    def func(*args, sleep_time=sleep_time):
        # Delay before issuing the request.
        time.sleep(sleep_time)
        response = req_func(*args)

        message = 'Return {} from {} {}'.format(
            response.status_code, response.request.method, response.url)
        logger.debug(message)

        processed = ret_func(response)
        return processed, response

    return func