    def parse(self):
        sql = "select id,name from test.xiecheng_landmark "
        result_set = self.db_conn.QueryDict(sql)
        logging.info(sql)
        logging.info("result_set len: %d" % len(result_set))
        for row in result_set:
            origin_keyword = self.ToString(row['name'])
            logging.info("process id: %d, name: %s" % (row['id'], row['name']))
            format_keyword = self._rtrim(origin_keyword)
            format_keyword = self._replace(format_keyword)
            # format_keyword = self._full_match(format_keyword)
            (city, keyword) = self._ltrimcity(format_keyword)
            db_dict = {}
            if city:
                db_dict['city'] = city
            if len(keyword) < 5:
                db_dict['kxflag'] = 'invalid'
                db_dict['reason'] = 'too short'
            db_dict['format_name'] = keyword
            self._search_landmark(db_dict)
            self.db_conn.ExecuteUpdateDict('test.xiecheng_landmark', db_dict,
                                           {'id': row['id']})


if __name__ == '__main__':
    btlog_init('encode.log', console=True, logfile=True)
    e = XiechengProcessor()
    e.parse()
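
# --- Editor's sketch: _ltrimcity is called above but defined elsewhere in
# this class. Its (city, keyword) return shape suggests it peels a known
# city name off the front of the keyword; the standalone helper below is an
# assumption for illustration, with known_cities as a hypothetical lookup.
def ltrimcity_sketch(keyword, known_cities):
    for city in known_cities:
        if keyword.startswith(city):
            return (city, keyword[len(city):])
    return ('', keyword)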
sql = ("select distinct i.hotelid from tmp_hotel_info i, tmp_hotel_price p " " where p.hotelid = i.hotelid order by hotelid asc") hotel_result_set = self.product_conn.Query(sql) logging.info("there are total hotel in product db: %d" % len(hotel_result_set)) (only_exists_in_mapping, both_exists, only_exists_in_product) = self.DiffList(mapping_result_set, hotel_result_set) logging.info("only exists in mapping: %d" % len(only_exists_in_mapping)) logging.info("both exists: %d" % len(both_exists)) logging.info("only exists in product: %d" % len(only_exists_in_product)) tmp_list = [str(i[0]) for i in only_exists_in_product] self.SaveList(filename, tmp_list) def Run(self, filename="new_product_hotelid.txt"): self.NewHotel(filename) file_list = self.LoadList(filename) new_list = [] for i in file_list: if len(i) > 0: new_list.append(i) self.GenerateHotelInfoAdd(new_list) if __name__ == '__main__': btlog_init('log/log_hotel_info_add.log', logfile=False, console=True) k = HotelInfoAdd() k.Run()
        for row in result_set:
            path_list = row['ext_path'].split(u'|')
            name_list = []
            for path in path_list:
                name_list.append(zz6_info_dict[int(path)])
            zz6_name_dict[row['id']] = name_list

        for sid, name_list in zz6_name_dict.iteritems():
            flag = False
            for city_id, city_name_list in path_name_dict.iteritems():
                b = self.name_math(name_list, city_name_list)
                if b:
                    flag = True
                    sql = "update zz6_info_new set ext_city_id = '%s' where id = %s" % (
                        self.ToString(city_id), self.ToString(sid))
                    logging.info(sql)
                    self.db_conn.Execute(sql)
                    break
            if not flag:
                logging.info("invalid sid: %s" % self.ToString(sid))

    def run(self):
        self.calculate_path_level()


if __name__ == '__main__':
    btlog_init('log_format.log', logfile=True, console=False)
    d = Zz6Formater()
    # d.run()
    # d.stat()
    d.calculate_city_id()
        (self.opt, others) = parser.parse_args()
        self.db_conn = MySQLOperator()
        if not self.db_conn.Connect(**DB_CONF):
            logging.error("db error")
            sys.exit()

    def run(self):
        if self.IsRunning('== running =='):
            return
        self.WritePidFile()
        process_pool = []
        for site in ('hidemyass', 'freeproxylists', 'free_proxy_list'):
            process_pool.append(Process(target=do_verify, args=(site, 'good')))
            process_pool.append(Process(target=do_verify, args=(site, 'moderate')))
            process_pool.append(Process(target=do_verify, args=(site, None)))
        for process in process_pool:
            process.start()
        for process in process_pool:
            process.join()


if __name__ == '__main__':
    btlog_init('log_manager.log', logfile=True, console=True, level='DEBUG')
    v = VerifierManager()
    v.run()
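
# --- Editor's sketch: only do_verify's (site, kxflag) call shape is visible
# in this file. Given the proxy_<site> tables queried elsewhere in this repo,
# each worker presumably re-checks proxies matching its flag; the helper
# below only illustrates the selection it might start from (an assumption).
def do_verify_selection(site, kxflag):
    table = 'proxy_%s' % site
    clause = "kxflag = '%s'" % kxflag if kxflag is not None else "kxflag = ''"
    return "select ip, port from %s where %s" % (table, clause)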
        except Exception, e:
            logging.warn("e: %s" % str(e))
            # format_exc() returns the traceback as a string; print_exc()
            # only prints it and returns None, so it logged nothing useful
            logging.warn("traceback: %s" % traceback.format_exc())
            return ua_list
        return ua_list

    def do_url(self, url):
        logging.debug("url: %s" % self.ToString(url))
        html_data = self._crawl_url(url)
        if len(html_data) == 0:
            return
        return self._parse_html(html_data)

    def run(self):
        url_list = [
            "http://www.useragentstring.com/pages/Chrome/",
            "http://www.useragentstring.com/pages/Internet%20Explorer/",
            "http://www.useragentstring.com/pages/Firefox/",
        ]
        total_ua_list = []
        for url in url_list:
            a_list = self.do_url(url)
            # do_url returns None when the page could not be fetched
            if a_list:
                total_ua_list.extend(a_list)
        self.SaveList('ua_list.txt', total_ua_list)


if __name__ == '__main__':
    btlog_init("log_kuoci_processor.log", logfile=False, console=True,
               level='DEBUG')
    p = UAUtil()
    p.run()
logging.info("AAAA: %s" % sql) if self.opt.commit: self.sem_conn.Execute(sql) else: logging.warn("SKIP keywordid: %d, adgroupname: %s" % (info_dict['keywordid'], info_dict['adgroupname'])) ''' if self.opt.commit: for keywordid in only_exists_in_mapping: sql = "delete from %s where kwid = %d" % (self.KEYWORD_SERVLET, keywordid) self.sem_conn.Execute(sql) ''' def Run(self): if self.opt.account: self.DoAccount(self.opt.account) if self.opt.full: for account in ("1", "30", "32"): self.DoAccount(account) def test(self): l1 = [1,2,3,4,5,6,7,8,9] l2 = [4,6,8,9, 200, 201] print self._Diff(l1, l2) if __name__ == '__main__': btlog_init('log/log_keyword_hotel_relation.log', logfile=True, console=True) k = KeywordHotelRelation() k.Run()
    def do_hidemyass(self):
        url_list = [
            "http://proxylist.hidemyass.com/1",
            "http://proxylist.hidemyass.com/2",
            "http://proxylist.hidemyass.com/3",
            "http://proxylist.hidemyass.com/4",
            "http://proxylist.hidemyass.com/5",
            "http://proxylist.hidemyass.com/6",
            "http://proxylist.hidemyass.com/7",
            "http://proxylist.hidemyass.com/8",
            "http://proxylist.hidemyass.com/9",
            "http://proxylist.hidemyass.com/10",
        ]
        for url in url_list:
            proxy_list = self.do_url(url, self._parse_hidemyass)
            logging.info("count: %d for url: %s" % (len(proxy_list), url))
            for proxy in proxy_list:
                logging.info("proxy: %s" % str(proxy))
                proxy['kxflag'] = ''
                proxy['create_time'] = datetime.now()
                self.db_conn.Upsert('proxy_hidemyass', proxy, ['ip', 'port'])

    def run(self):
        self.do_hidemyass()


if __name__ == '__main__':
    btlog_init('log_download.log', logfile=True, console=True, level='DEBUG')
    d = ProxyDownloader()
    d.run()
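
# --- Editor's note: Upsert lives in db.mysqlv6 and is not shown here. Keyed
# on ['ip', 'port'], it most plausibly issues a MySQL
# INSERT ... ON DUPLICATE KEY UPDATE along these lines (an assumption):
UPSERT_SQL_SKETCH = (
    "insert into proxy_hidemyass (ip, port, kxflag, create_time) "
    "values (%s, %s, %s, %s) "
    "on duplicate key update kxflag = values(kxflag), "
    "create_time = values(create_time)"
)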
items = line.split("\t") if len(items) != 2: print line, items raise Exception return items class CsvProcessor(EncodeChinese): def line_items(self, line): items = line.split(",") if len(items) != 2: print line, items raise Exception return items def test(): c = CsvProcessor() for line in ('1,http://a.b.c/-北京-jiudian', '2,http://b.c.d/-a-jiudian', '3,http://a.b.c/-%E5%8C%97%E4%BA%AC-jiudian'): print c.do_line(line) def usage(): print 'useage: %s filename gbk|utf8' % sys.argv[0] sys.exit() if __name__ == '__main__': if len(sys.argv) != 3 or sys.argv[2] not in ['gbk', 'utf8']: usage() btlog_init('log/encode.log') e = CsvProcessor() e.run(sys.argv[1])
    # start scrapy
    def do_run(self):
        # get min(task_date)
        sql = ("select task_date from baidu_keyword_manager where flag = 'init' "
               "order by task_date asc limit 1")
        result_set = self.task_db_conn.Query(sql)
        if not result_set:
            logging.info("no task")
            return
        task_date = result_set[0][0]
        cmd = "/usr/local/bin/scrapy crawl baidu_keyword -a task_date=%s --logfile=log_scrapy.baidu_keyword.%s.log" % (
            task_date,
            task_date,
        )
        logging.info("cmd: %s" % cmd)
        # block until the crawl finishes; its stdout is not used
        subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).communicate()[0].strip()

    def run(self):
        if self.IsRunning():
            logging.info("== is running ==")
            return
        self.WritePidFile()
        self.do_run()


if __name__ == "__main__":
    btlog_init("log_baidu_keyword_scrapy_manager.log", logfile=True, console=True)
    t = SeoCoreKeywordManager()
    t.run()
"http://hidemyass.com/proxy-list/5", "http://hidemyass.com/proxy-list/6", "http://hidemyass.com/proxy-list/7", "http://hidemyass.com/proxy-list/8", "http://hidemyass.com/proxy-list/9", "http://hidemyass.com/proxy-list/10", ] for url in url_list: proxy_list = self.do_url(url, self._parse_hidemyass) logging.info("count: %d for url: %s" % (len(proxy_list), url)) for proxy in proxy_list: logging.info("proxy: %s" % str(proxy)) proxy['kxflag'] = '' proxy['create_time'] = datetime.now() self.db_conn.Upsert('proxy_hidemyass', proxy, ['ip', 'port']) def run(self): self.do_hidemyass() def test(self): c = self.LoadFile("cache/8024ef3ca080b74ab57cb5ef36562e5d.html") p = self._parse_hidemyass(c) print p if __name__ == '__main__': btlog_init('log_download.log', logfile=True, console=True) d = ProxyDownloader() d.run()
self.conn_57.Execute("update viator_destination_attraction set destination=%s,destinationid=%s where id=%s", [destination,destinationid,row['id']]) def tmp2(self): sql = "select distinct anchor_text,dgroup,dgroupid from viator_group " self.group_res = self.conn_57.QueryDict(sql) sql = "select destination,destinationid from viator_attraction_city where pid=0 " tmp_res = self.conn_57.QueryDict(sql) for destination_info in tmp_res: for group_info in self.group_res: new_url = "http://www.viator.com/%s-tours/%s/%s-%s" % (destination_info['destination'], group_info['dgroup'], destination_info['destinationid'], group_info['dgroupid']) sql = "insert into viator_ttd_group(destination,destinationid,dgroup,dgroupid,href,source) values(%s,%s,%s,%s,%s,'proc_gen')" self.conn_57.Execute(sql, [destination_info['destination'],destination_info['destinationid'], group_info['dgroup'],group_info['dgroupid'],new_url]) ''' select distinct anchor_text,dgroup,href_key from viator_group ''' if __name__ == '__main__': btlog_init('log_tool.log', logfile=False, console=True) k = ToolTest() # k.test() # k.do_format() # k.check2() k.tmp2()
from datetime import datetime

from scrapy.selector import HtmlXPathSelector

sys.path.append('/home/yangrq/projects/pycore')
from utils.common_handler import CommonHandler
from utils.btlog import btlog_init
from db.mysqlv6 import MySQLOperator

from baidu_common import BaiduCommon
from config import *


class Tool(CommonHandler):

    def __init__(self):
        self.db_conn = MySQLOperator()
        if not self.db_conn.Connect(**DB_CONF):
            raise Exception, "db error"

    def run(self):
        for flag in ('good', 'moderate', ''):
            for table in ('proxy_free_proxy_list', 'proxy_freeproxylists',
                          'proxy_hidemyass'):
                sql = "select count(*) from %s where kxflag = '%s' " % (table, flag)
                result_set = self.db_conn.Query(sql)
                logging.info(sql)
                logging.info("count: %d" % result_set[0][0])


if __name__ == '__main__':
    btlog_init(logfile=False, console=True, level='DEBUG')
    v = Tool()
    v.run()
            url = BaiduCommon.random_request()
            print url
            html = urllib2.urlopen(url, timeout=3).read()
            if len(html) > 100:
                parse_dict = BaiduCommon.parse(html)
                if parse_dict['valid_flag']:
                    succeed_count += 1
        except Exception, e:
            logging.warn("error: %s" % str(e))

        # tier the proxy by how many of the test fetches succeeded
        kxflag = ''
        if succeed_count == 0:
            kxflag = 'bad'
        elif succeed_count < 6:
            kxflag = 'poor'
        elif succeed_count < 9:
            kxflag = 'moderate'
        else:
            kxflag = 'good'
        logging.info("proxy: %s, succeed_count: %d" % (tmp_proxy, succeed_count))
        return kxflag

    def test(self):
        proxy_list = ['http://140.120.94.26:8088', 'http://181.208.70.75:8080']
        for proxy in proxy_list:
            self._real_verify(proxy)


if __name__ == '__main__':
    btlog_init('log_verifier.log', logfile=True, console=True, level='DEBUG')
    v = ProxyVerifier()
    v.test()
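
# --- Editor's sketch: the loop that accumulates succeed_count is elided
# above this fragment. The 6/9 cut-offs suggest roughly ten attempts per
# proxy; the shape below is an assumption, with `attempt` standing in for
# one proxied request that returns True on success.
import logging

def count_successes(attempt, attempts=10):
    succeed_count = 0
    for _ in range(attempts):
        try:
            if attempt():
                succeed_count += 1
        except Exception, e:
            logging.warn("error: %s" % str(e))
    return succeed_count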
FILE_PATH = os.path.dirname(__file__)

sys.path.append('/home/yangrq/github/pycore')
from utils.btlog import btlog_init
from utils.common_handler import CommonHandler
from utils.http_client import HttpClient


class Tester(CommonHandler, HttpClient):

    def __init__(self):
        HttpClient.__init__(self)
        self.key = urllib.quote_plus('1d1180f3c7c41fd1760c5819fad8b4ed')

    def fetch(self):
        ret = self.DoGet('storage.service.kuxun.cn', 80,
                         '/storage/fetch-item?key=%s' % self.key)
        print ret

    def store(self):
        data = self.LoadFile('test.py')
        ret = self.DoPost('storage.service.kuxun.cn', 80,
                          '/storage/store-item?key=%s' % self.key, data)
        print ret

    def Run(self):
        self.store()


if __name__ == '__main__':
    btlog_init('a.log', console=True, logfile=False, level=logging.DEBUG)
    a = Tester()
    a.Run()
    # a.fetch()