def getGuid(url, cursor=None): parsed_url = urlparse.urlparse(url) netloc = parsed_url.netloc if netloc in ["blog.naver.com", "m.blog.naver.com"] or netloc.endswith(".blog.me"): return handleNBUrl(url) elif netloc == "blog.daum.net": return checkDaumPost(parsed_url[2]) elif netloc.endswith(".tistory.com"): return handleTistory(url) else: if cursor == None: try: DBHOST = "10.35.50.116" (db, n_cursor) = getDBCursor(host=DBHOST, user='******', passwd='blogcrawler', db='blogdb') except Exception, msg: return None else:
def getSiteData(url):
    """Look up the URL patterns stored for the site that *url* belongs to.

    The site id is resolved with ``getSiteId``; when one is found, the
    ``type``, ``page`` and ``param`` columns of its ``url_patterns`` rows are
    fetched and handed to ``makeURLPattern``.

    Args:
        url: URL whose site should be looked up.

    Returns:
        The result of ``makeURLPattern(rows)`` when pattern rows exist,
        otherwise the pair ``(None, None)``.
    """
    from test_util import getSiteId
    from common_func import getDBCursor

    DBHOST = "bbsdb-mst.s0.crawl.web.search"
    (db, cursor) = getDBCursor(host=DBHOST, user='******', passwd='bbscrawler', db='bbsdb')
    try:
        site_id = getSiteId(url, cursor)
        if not site_id:
            return None, None
        # Parenthesised single-string form prints the same text on Python 2
        # and Python 3.
        print("%s %s" % (site_id, url))
        query = "SELECT type, page, param FROM url_patterns where site_id = %s"
        # DB-API drivers expect a sequence of parameters, not a bare scalar.
        cursor.execute(query, (site_id,))
        results = cursor.fetchall()
    finally:
        # Release the connection on every path; the rows are already fetched.
        cursor.close()
        db.close()

    if results:
        return makeURLPattern(results)
    return None, None
def __init__(self, db_cursor=None):
    """Set up the lookup tables and load pattern / indie-blog data.

    Args:
        db_cursor: Database cursor to use while loading. When ``None`` a
            connection is opened with ``getDBCursor`` from the ``conf``
            settings and released again before the constructor returns.

    NOTE(review): when the cursor is created here, ``self.db_cursor`` still
    refers to the (closed) cursor afterwards, exactly as before.
    """
    self.db_dict = dict()
    self.db_cursor = None
    self.db_table = conf.INDIE_BLOG_CHANNEL_TABLE

    db_con = None
    is_my_cursor = db_cursor is None
    if is_my_cursor:
        db_con, db_cursor = getDBCursor(host=conf.GUID_GEN_DB_HOST,
                                        user=conf.GUID_GEN_DB_USER,
                                        passwd=conf.GUID_GEN_DB_PWD,
                                        db=conf.GUID_GEN_DB_NAME)
    self.db_cursor = db_cursor

    try:
        self.blog_fam_list = ["naver", "daum", "tistory", "blogspot", "aladin",
                              "interpark", "dreamwiz", "joins", "kyobo",
                              "chosun", "jinbo", "ohmynews", "moneta", "yes24",
                              "donga", "indie"]
        self.key_list = ["guid", "gen", "ourl", "curl", "vurl", "cid", "pid",
                         "fam", "trackback"]
        self.pt_dic = None  # pattern dic (ex. pt_dic["SITE_NAME"]["PATTERN_NAME"])
        self.indie_data_dic = dict()
        # Placeholder tokens for the channel id / post id.
        self.cid_str = "##CID##"
        self.pid_str = "##PID##"

        # presumably these fill pt_dic and indie_data_dic — confirm in their
        # definitions.
        self.initPatterns()
        self.initIndieData()
    finally:
        # Only tear down what this constructor opened, and do it even when
        # loading fails so the connection is not leaked.
        if is_my_cursor:
            self.db_cursor.close()
            db_con.close()
def setRules(self, db_cursor=None):
    """Load URL rules, parser mappings and build patterns from the database.

    Three queries are run in order:

    1. ``url_patterns`` joined with ``codes`` -> passed to ``self.makeRules``.
    2. ``url_pattern_parser`` -> stored in ``self.parser_dict`` as
       ``urlpatternID -> parserID``.
    3. ``url_build_patterns`` joined with ``codes`` -> stored in
       ``self.url_build_patterns`` as
       ``normalised guid pattern -> {code name: build pattern}``.

    Args:
        db_cursor: Cursor to run the queries on. When ``None`` a connection
            is opened with ``getDBCursor`` and closed again before returning,
            even if one of the steps raises.
    """
    owns_connection = db_cursor is None
    if owns_connection:
        db, cursor = getDBCursor(host="10.35.31.229", user="******", passwd="domanagerA!", db="domanager")
    else:
        cursor = db_cursor

    try:
        query = "SELECT a.idx, priority, urlType, scCode, inputPattern, guidPattern, b.name FROM url_patterns a, codes b where a.scCode = b.idx "
        cursor.execute(query)
        results = cursor.fetchall()
        self.makeRules(results)

        query = "SELECT urlpatternID, parserID FROM url_pattern_parser"
        cursor.execute(query)
        results = cursor.fetchall()
        for idx, parser_id in results:
            self.parser_dict[idx] = parser_id

        query = "SELECT a.guidPattern, a.buildPattern, b.name FROM url_build_patterns a, codes b where a.buildTypeCode = b.idx "
        cursor.execute(query)
        results = cursor.fetchall()
        for guid_pattern, build_pattern, name in results:
            # Round-trip the stored pattern through the URL helpers so the
            # dictionary key has the same form makeUrl produces elsewhere.
            o_netloc, o_key_netloc, o_path, o_params = getParsedUrl(guid_pattern)
            guid_pattern = makeUrl(o_netloc, o_key_netloc, o_path, o_params)
            if guid_pattern not in self.url_build_patterns:
                self.url_build_patterns[guid_pattern] = dict()
            self.url_build_patterns[guid_pattern][name] = build_pattern
    finally:
        # A caller-supplied cursor stays open; only release our own.
        if owns_connection:
            cursor.close()
            db.close()
if __name__ == "__main__":
    # Manual check: with a URL argument, print its GUID; otherwise replay the
    # (down_url, guid) pairs stored in url_test and compare against getGuid.
    import sys

    a = UrlFactory()
    # Sample URLs kept for ad-hoc experiments; not used by the checks below.
    TEST_URLS = ["http://ilwar.com/asdf/123?name=asdf ",
                 "http://ilwar.com/asdf/123/page/123?name=asdf ",
                 "http://www.ilwar.com/asdf/page/1?name=asdf ",
                 "http://ilwar.com/asdf/123/page/1?name=asdf ",
                 "http://ilwar.com/asdf",
                 "http://www.ilbe.com/index.php?mid=ilbe&category=123&document_srl=123123",
                 "http://www.ilbe.com/asdfasd?mid=ilbe&category=123&document_srl=123123",
                 "http://www.ilbe.com/123123",
                 "http://abc.tistory.com/asdf/123123",
                 "http://abc.tistory.com/asdf",
                 "http://abc.tistory.com/123",
                 "http://naver.com/asdfa/asdfa/asdfa/board.php?bo_table=cm_lego&wr_id=1231&dasd=asdf",
                 "http://todayhumor.co.kr/board/list.php?table=gomin",
                 "http://todayhumor.co.kr/board/list.php?table=gomin&page=4"
                 ]

    if len(sys.argv) > 1:
        url = sys.argv[1]
        print("%s %s" % (url, a.getGuid(url)))
    else:
        db, cursor = getDBCursor(host="10.35.31.229", user="******", passwd="domanagerA!", db="domanager")
        try:
            query = "SELECT down_url, guid FROM url_test "
            cursor.execute(query)
            results = cursor.fetchall()
        finally:
            # All rows are in memory now; release the connection right away.
            cursor.close()
            db.close()

        fail_count = 0
        for url, guid in results:
            res = a.getGuid(url)
            if res:
                (guid_type, new_guid, other_dict) = res
                if guid != new_guid:
                    print("%s %s %s SOMETHING WRONG" % (url, guid, new_guid))
                    fail_count += 1
                else:
                    print("%s %s IT's OK!!" % (url, guid))
            else:
                # getGuid produced nothing for this URL.
                print("%s AAAAAAAA" % (url,))
        # Report the mismatch count that was previously tallied but never shown.
        print("mismatches: %d / %d" % (fail_count, len(results)))
def makeRule(self, db_cursor=None):
    """Build the per-host parser rule objects from the database.

    Reads the rule-type names (``codes`` rows with ``code_category = 4``) and
    the joined ``pado.parsers`` / ``pado.parser_rules`` rows, then fills
    ``self.rules`` (``host -> parser_id -> HostParserRule``),
    ``self.template_rules``, ``self.string_filter`` and ``self.id_dict``.

    Args:
        db_cursor: Cursor to run the queries on. When it is falsy a
            connection is opened with ``getDBCursor``.

    NOTE(review): the cursor is closed at the end only when
    ``db_cursor == None``, and the connection object ``db`` is never closed.
    NOTE(review): this block's indentation was reconstructed from a
    whitespace-collapsed listing; the placement of the ``type == "delete"``
    check relative to the ``if parent_id:`` block could not be determined
    with certainty — confirm against the original layout.
    """
    if db_cursor:
        cursor = db_cursor
    else:
        db, cursor = getDBCursor(host="10.35.31.229", user="******",passwd="domanagerA!", db="domanager")
    code_dict = dict()
    query = "SELECT idx, name from codes where code_category = 4"
    cursor.execute(query)
    results = cursor.fetchall()
    # Map rule code idx -> rule type name.
    code_dict = dict()
    for idx, name in results:
        code_dict[idx] = name
    query = "SELECT a.idx, domainID, domain, activeYN, b.idx, field, ruleCode, ruleVal, b.parentRuleID FROM pado.parsers a, pado.parser_rules b where a.idx = b.parserID"
    cursor.execute(query)
    results = cursor.fetchall()
    # Example target pages:
    # http://clien.net/cs2/bbs/board.php?bo_table=park&wr_id=35767059
    # http://mlbpark.donga.com/mbs/articleVC.php?mbsC=bullpen2&mbsIdx=1954295
    # http://marumaru.in/b/free/76251
    # parsing rule type : meta, next_text, class, id, html, class_count
    children_rules = dict()  # parent rule idx -> {child idx: NodeRule} waiting for the parent row
    offset_dict = dict()     # rule idx -> offset value seen before that rule was created
    all_rules = dict()       # rule idx -> NodeRule, for every rule created in this call
    for parser_id, domain_id, host, activeYN, idx, field, rule_code, value, parent_id in results:
        # In "service" mode only parsers flagged active ("Y") are loaded.
        if self.mode == "service" and activeYN != "Y":
            continue
        if rule_code in code_dict:
            type = code_dict[rule_code]
        else:
            type = ""
        # Rule code 60 rows create no NodeRule: they only record
        # (int(value), parser_id) for the host.
        # (presumably a template marker — confirm the meaning of code 60)
        if rule_code == 60:
            self.template_rules[host] = int(value), parser_id
            continue
        # Find or create the HostParserRule for this (host, parser_id).
        if not host in self.rules:
            self.rules[host] = dict()
            current_rule = HostParserRule()
            self.rules[host][parser_id] = current_rule
        else:
            if parser_id in self.rules[host]:
                current_rule = self.rules[host][parser_id]
            else:
                current_rule = HostParserRule()
                self.rules[host][parser_id] = current_rule
        if parent_id:
            parent_id = int(parent_id)
            if type == "offset":
                # An offset row sets the offset of its parent rule; if the
                # parent has not been created yet, park the value.
                if parent_id in current_rule.rules:
                    current_rule.rules[parent_id].offset = value
                elif parent_id in all_rules:
                    all_rules[parent_id].offset = value
                else:
                    offset_dict[parent_id] = value
            elif parent_id in current_rule.rules:
                # Parent is already a top-level rule here: attach the child directly.
                current_rule.rules[parent_id].children_rules[idx] = NodeRule(field, type, value)
                if idx in offset_dict:
                    current_rule.rules[parent_id].children_rules[idx].offset = offset_dict[idx]
                all_rules[idx] = current_rule.rules[parent_id].children_rules[idx]
            else:
                # Parent not registered as a top-level rule yet: hold the
                # child until the parent row is processed.
                if parent_id not in children_rules:
                    children_rules[parent_id] = dict()
                children_rules[parent_id][idx] = NodeRule(field, type, value)
                if idx in offset_dict:
                    children_rules[parent_id][idx].offset = offset_dict[idx]
                all_rules[idx] = children_rules[parent_id][idx]
        if type == "delete":
            self.string_filter[parent_id] = value
        else:
            # class[3]
            if parent_id:
                pass
            else:
                # Top-level rule (no parent): register it and adopt any
                # children / offset that arrived earlier.
                t_rule = NodeRule(field, type, value)
                all_rules[idx] = t_rule
                if idx in offset_dict:
                    t_rule.offset = offset_dict[idx]
                current_rule.rules[idx] = t_rule
                if idx in children_rules:
                    for c_idx, child_rule in children_rules[idx].items():
                        # NOTE(review): "offset" rows are diverted to
                        # offset_dict above and never stored in
                        # children_rules, so this first branch looks
                        # unreachable — confirm.
                        if child_rule.type == "offset":
                            current_rule.rules[idx].offset = child_rule.value
                        else:
                            current_rule.rules[idx].children_rules[c_idx] = child_rule
        self.id_dict[parser_id] = current_rule
    if db_cursor == None:
        cursor.close()
def getCursor(self):
    """Open the blogdb connection and keep it on the instance.

    The pair returned by ``getDBCursor`` is stored as ``self.db`` and
    ``self.cursor``.
    """
    handle, cur = getDBCursor(host="10.35.50.116", user='******', passwd='blogcrawler', db='blogdb')
    self.db = handle
    self.cursor = cur