class ServerDatabaseHandler:
    """Thin wrapper around a MySQL connection for the 'haberbus' database.

    Opens the connection and forces the session to UTF-8 on construction.
    Depends on the project-local ``LogHandler`` and the ``MySQLdb`` driver.
    """

    def __init__(self):
        # Create the logger BEFORE the try block: the original created it
        # inside the try, so a failure in LogHandler itself would have left
        # self.logHandler unset and made the except clause crash.
        self.logHandler = LogHandler("ServerDatabaseHandle")
        try:
            # NOTE(review): 'yourdbpassowrd' kept verbatim (changing it would
            # break the login) — looks like a typo for 'yourdbpassword';
            # credentials should live in config, not source.
            self.serverHandler = MySQLdb.connect('127.0.0.1', 'root', 'yourdbpassowrd', 'haberbus')
            self.serverHandler.set_character_set('utf8')
            self.serverHandler.autocommit(True)
            self.cursor = self.serverHandler.cursor()
            # Belt-and-braces: force UTF-8 on the session as well.
            self.cursor.execute('SET NAMES utf8;')
            self.cursor.execute('SET CHARACTER SET utf8;')
            self.cursor.execute('SET character_set_connection=utf8;')
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt); failure is logged, not raised.
            self.logHandler.logger("__init__")

    def executeQuery(self, query):
        """Execute a raw SQL string.

        :param query: full SQL statement; SELECTs return all rows,
                      anything else is committed and returns None.

        WARNING: callers build `query` by string concatenation — SQL
        injection risk; prefer parameterized queries where possible.
        """
        self.cursor.execute(query)
        # First whitespace-delimited word decides SELECT vs. write.
        if query.split()[0].lower() == "select":
            return self.cursor.fetchall()
        # autocommit is on, but commit explicitly for clarity/safety.
        self.serverHandler.commit()

    def closeConnection(self):
        """Close the underlying MySQL connection."""
        self.serverHandler.close()
def __init__(self, view):
    """Wire up logging and the Kiwoom OpenAPI OCX control.

    :param view: handed to LogHandler and stored for later use by the UI.
    """
    self.user = None
    self.view = view
    # Logger initialization (translated from: "Logger 초기화")
    logHandler = LogHandler(self.view)
    logHandler.setLevel(logging.DEBUG)
    # NOTE(review): `logger` is not defined in this method — presumably a
    # module-level logging.Logger in this file; confirm it exists.
    logger.addHandler(logHandler)
    # Create the Kiwoom Securities OpenAPI OCX instance
    # (translated from: "키움증권API OCX Instance 생성")
    self.kiwoom = QAxWidget("KHOPENAPI.KHOpenAPICtrl.1")
    # Register event handlers (translated from: "event handler 등록").
    # The [type, ...] subscripts select the exact COM signal overloads —
    # they must match the API's signatures and must not be edited casually.
    self.kiwoom.OnEventConnect[int].connect(self.OnEventConnect)
    self.kiwoom.OnReceiveTrData[str, str, str, str, str, int, str, str, str].connect(self.OnReceiveTrData)
    self.kiwoom.OnReceiveRealData[str, str, str].connect(self.OnReceiveRealData)
    self.kiwoom.OnReceiveMsg[str, str, str, str].connect(self.OnReceiveMsg)
    self.kiwoom.OnReceiveChejanData[str, int, str].connect(self.OnReceiveChejanData)
    self.kiwoom.OnReceiveRealCondition[str, str, str, str].connect(self.OnReceiveRealCondition)
    self.kiwoom.OnReceiveTrCondition[str, str, str, int, int].connect(self.OnReceiveTrCondition)
    self.kiwoom.OnReceiveConditionVer[int, str].connect(self.OnReceiveConditionVer)
def __init__(self):
    """Open the MySQL connection to 'haberbus' and force UTF-8.

    On any connection/setup failure the error is logged (not raised).
    """
    # Create the logger BEFORE the try block: originally it was created
    # inside the try, so a failure in LogHandler itself would leave
    # self.logHandler unset and make the except clause crash.
    self.logHandler = LogHandler("ServerDatabaseHandle")
    try:
        # NOTE(review): 'yourdbpassowrd' kept verbatim (changing it would
        # break the login) — looks like a typo for 'yourdbpassword'.
        self.serverHandler = MySQLdb.connect('127.0.0.1', 'root', 'yourdbpassowrd', 'haberbus')
        self.serverHandler.set_character_set('utf8')
        self.serverHandler.autocommit(True)
        self.cursor = self.serverHandler.cursor()
        # Force UTF-8 on the session as well.
        self.cursor.execute('SET NAMES utf8;')
        self.cursor.execute('SET CHARACTER SET utf8;')
        self.cursor.execute('SET character_set_connection=utf8;')
    except Exception:
        # Narrowed from a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt.
        self.logHandler.logger("__init__")
import random import time import json import requests from requests.adapters import HTTPAdapter from LogHandler import LogHandler log = LogHandler('downloader', file=False) def get_proxy(): bak_url = 'http://123.207.35.36:5010/get/' url = 'http://127.0.0.1:5010/get' try: proxy = requests.get(url).text except Exception as e: print('本地获取代理失败,远程从获取') proxy = requests.get(bak_url).text return proxy class Downloader(object): def __init__(self, *args, **kwargs): pass @property def user_agent(self): """ return an User-Agent at random :return:
------------------------------------------------- File Name: CheckProxy Description : used for check getFreeProxy.py Author : Tc date: 2018/12/24 ------------------------------------------------- """ __author__ = 'Tc' import sys from getFreeProxy import GetFreeProxy from utilFunction import verifyProxyFormat from LogHandler import LogHandler log = LogHandler('check_proxy', file=False) class CheckProxy(object): @staticmethod def checkAllGetProxyFunc(): """ 检查getFreeProxy所有代理获取函数运行情况 Returns: None """ import inspect member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) proxy_count_dict = dict() for func_name, func in member_list:
class DatabaseHandler:
    """SQLite database handler.

    Wraps an sqlite3 connection and builds simple INSERT/SELECT/UPDATE/DELETE
    statements from Python lists.

    WARNING: queries are assembled by string concatenation — SQL injection
    risk if any value comes from untrusted input; parameterized queries
    would be safer but would change the internal query-building contract.
    """

    def __init__(self, path, log_level=0):
        """
        :param path: path to the db, including db name
        :param log_level: 0-4, Debug...Critical, see LogHandler class
        """
        self.__path = path
        self.__tables = {}    # table name -> list of column names currently "in use"
        self.__keywords = {}  # reserved for future keyword handling
        self.__lh = LogHandler(log_level)

    def open(self):
        """Open the database connection and create a cursor."""
        self.__db = sqlite3.connect(self.__path)
        self.__dbcursor = self.__db.cursor()
        self.__lh.info("DB is opened: " + self.__path)

    def close(self):
        """Close the database connection."""
        self.__db.close()
        self.__lh.info("DB is closed: " + self.__path)

    def get_table_name(self, name):
        """Get the column names of the selected table.

        :param name: table name
        :return: list of column names
        """
        self.__dbcursor.execute("PRAGMA table_info('" + name + "')")
        fetch = self.__dbcursor.fetchall()
        # PRAGMA table_info rows are (cid, name, type, ...) — index 1 is the name.
        columns = [col[1] for col in fetch]
        self.__lh.debug("Table columns: " + ", ".join(columns))
        return columns

    def set_used_table(self, name):
        """Mark a table as used and cache its columns.

        :param name: table name
        """
        self.__tables[name] = self.get_table_name(name)
        self.__lh.info("New table is set to used: " + name)

    def release_used_table(self, name):
        """Release the given table.

        :param name: table name
        """
        del self.__tables[name]
        self.__lh.info("Table is released from used tables: " + name)

    def clear_used_tables(self):
        """Release all used tables."""
        self.__tables.clear()
        self.__lh.info("All tables are released from used tables")

    def execute_query(self, query):
        """Execute the given sqlite query string.

        :param query: sqlite query string
        :return: all fetched rows, or [] if the query failed
        """
        try:
            self.__dbcursor.execute(query)
            self.__lh.info("Executed query: " + query)
            # fetchall() moved INSIDE the try: in the original it ran even
            # after a failed execute and could raise a second, unlogged error.
            return self.__dbcursor.fetchall()
        except Exception as e:
            self.__lh.exceptionHandling(e)
            return []

    def commit(self):
        """Apply changes done to the database."""
        self.__db.commit()

    def insert(self, table_name, data, columns=None):
        """Build and execute an INSERT query.

        :param table_name: used table's name
        :param data: list of strings, one row's values
        :param columns: list of column names the data maps onto
        """
        # Mutable-default fixed: [] as a default is shared between calls.
        columns = [] if columns is None else columns
        if self.is_registered(table_name, data, columns):
            self.__lh.warning("Record already inserted")
        else:
            try:
                keyword = "INSERT INTO"
                keyword2 = "VALUES"
                cols = "(" + ",".join(columns) + ")"
                args = "(" + ",".join(data) + ")"
                query = " ".join([keyword, table_name, cols, keyword2, args]) + ";"
                self.execute_query(query)
            except Exception as e:
                self.__lh.exceptionHandling(e)

    def select(self, table_name, cols=None, condition=None, distinct=False):
        """Build and execute a SELECT query.

        :param table_name: used table's name
        :param cols: list of columns to select (["*"] selects everything)
        :param condition: list of (column, operator, value) tuples,
                          see tuple_list_to_statement
        :param distinct: set True for SELECT DISTINCT
        :return: list of per-row lists of (column, value) tuples;
                 [] on error or when cols is empty
        """
        cols = [] if cols is None else cols
        condition = [] if condition is None else condition
        zipped = []
        try:
            if not cols:
                raise Exception("Empty selected columns list")
            keyword = "SELECT DISTINCT" if distinct else "SELECT"
            args = "*" if cols == ["*"] else ", ".join(cols)
            table = "FROM " + table_name
            where = self.where_clause(condition) if condition else ""
            query = " ".join([keyword, args, table, where]) + ";"
            rows = self.execute_query(query)
            for row in rows:
                self.__lh.debug("Select raw return values: " + str(row))
                zipped.append(list(zip(cols, row)))
                # Debug trace of one returned line, "col = value, ..." —
                # fixes the original `index is not (len(cols) - 1)`, which
                # identity-compared ints and could mis-join on large indices.
                line = ", ".join(str(c) + " = " + str(v) for c, v in zip(cols, row))
                self.__lh.debug("Returned line: " + line)
            self.__lh.debug(zipped)
        except Exception as e:
            self.__lh.exceptionHandling(e)
        return zipped

    def update(self, table_name, data=None, condition=None):
        """Build and execute an UPDATE query.

        :param table_name: used table's name
        :param data: list of (column, value) tuples to set
        :param condition: list of (column, operator, value) tuples, see where_clause
        """
        data = [] if data is None else data
        condition = [] if condition is None else condition
        keyword = "UPDATE " + table_name + " SET"
        # Build fresh (col, "=", value) triples instead of rewriting the
        # caller's list in place (the original mutated its `data` argument).
        assignments = [(item[0], "=", item[1]) for item in data]
        d_args = self.tuple_list_to_statement(assignments)
        c_args = self.where_clause(condition)
        query = " ".join([keyword, d_args, c_args]) + ";"
        self.execute_query(query)

    def delete(self, table_name, condition=None):
        """Build and execute a DELETE query.

        :param table_name: used table's name
        :param condition: list of (column, operator, value) tuples, see where_clause
        """
        condition = [] if condition is None else condition
        keyword = "DELETE FROM " + table_name
        c_args = self.where_clause(condition)
        query = " ".join([keyword, c_args]) + ";"
        self.execute_query(query)

    def where_clause(self, condition=None):
        """Build a WHERE clause.

        :param condition: list of (column, operator, value) tuples
        :return: WHERE clause string
        """
        condition = [] if condition is None else condition
        clause = "WHERE " + self.tuple_list_to_statement(condition)
        self.__lh.debug("WHERE clause: " + clause)
        return clause

    def and_clause(self):
        # TODO
        pass

    def or_clause(self):
        # TODO
        pass

    def like_clause(self):
        # TODO
        pass

    def glob_clause(self):
        # TODO
        pass

    def limit_clause(self):
        # TODO
        pass

    def order_by_clause(self):
        # TODO
        pass

    def group_by_clause(self):
        # TODO
        pass

    def having_clause(self):
        # TODO
        pass

    def tuple_list_to_statement(self, tuple_list=None):
        """Build a comma-joined statement string from tuples.

        :param tuple_list: e.g. [(column, relation operator, value), ...]
        :return: "col op value,col op value,..." (empty string for no tuples)
        """
        tuple_list = [] if tuple_list is None else tuple_list
        statement = ""
        for t in tuple_list:
            tmp = str(t[0]) + " " + str(t[1]) + " " + str(t[2])
            statement = tmp if statement == "" else ",".join([statement, tmp])
        return statement

    def is_registered(self, table_name, data_list=None, col_list=None):
        """Check whether a row with these values already exists.

        NOTE(review): the actual SELECT is commented out, so this currently
        always returns False — duplicate detection is effectively disabled.

        :return: True if a matching row exists, else False
        """
        data_list = [] if data_list is None else data_list
        col_list = [] if col_list is None else col_list
        self.__lh.debug(data_list)
        self.__lh.debug(col_list)
        # BUG FIX: conditions are (column, operator, value); the original
        # swapped data and column: (data_list[x], "=", col_list[x]).
        new_list = [(col_list[x], "=", data_list[x]) for x in range(len(data_list))]
        result = []
        # result = self.select(table_name, ["*"], new_list)
        return result != []
def __init__(self, link_queue, data_queue, thread_name, proxies):
    """Worker thread that crawls detail pages.

    :param link_queue: queue of links to fetch
    :param data_queue: queue where extracted data is pushed
    :param thread_name: name given to the underlying Thread
    :param proxies: proxy pool shared with the coordinator
    """
    # Initialise the Thread machinery first, with a readable name.
    threading.Thread.__init__(self, name=thread_name)
    # Shared state handed in by the coordinator.
    self.link_queue = link_queue
    self.data_queue = data_queue
    self.proxies = proxies
    # Dedicated logger for this worker.
    self.log = LogHandler('detail_data_crawler')
def __init__(self):
    """Set up logging and the database layer, then start the main loop.

    NOTE(review): calling self.run() from the constructor means the object
    never finishes construction until the whole job completes — consider
    separating construction from execution.
    """
    self.logHandler = LogHandler("Main")
    # Database connection wrapper (see ServerDatabaseHandler).
    self.serverHandler = ServerDatabaseHandler()
    self.run()
# -*- coding: utf-8 -*- from __future__ import unicode_literals import requests, sys, time sys.path.append('../') from getProxy.manipulateProxy import get, delet from LogHandler import LogHandler reqErro = LogHandler('neterro', stream=False) class request(object): """docstring for request""" def __init__(self, data): super(request, self).__init__() self.data = data def judge(self): if self.data['url'].find('https://www.amazon.com') < 0: self.data['url'] = 'https://www.amazon.com/' + self.data['url'] def cport(self): port = get() return { "http": "http://" + port, "https": "http://" + port, } def webGet(self): try: proxies = []
def __init__(self):
    """Set up the DB client, the proxy queue names and the manager's logger."""
    # Local import keeps db package loading lazy, as in the original.
    from db.DbClient import DbClient
    self.db = DbClient()
    # Queue names: raw candidates vs. validated proxies.
    self.raw_proxy_queue = 'raw_proxy'
    self.useful_proxy_queue = 'useful_proxy'
    self.log = LogHandler('proxy_manager')
def __init__(self):
    """Scheduler that refreshes the proxy pool."""
    # Chain up first; ProxyManager.__init__ presumably initialises shared
    # state including a default self.log — confirm against ProxyManager.
    ProxyManager.__init__(self)
    # Override with a scheduler-specific log target (must stay AFTER the
    # base-class init, which would otherwise clobber it).
    self.log = LogHandler('refresh_schedule')
def __init__(self, queue, item_dict):
    """Worker thread that validates proxies.

    :param queue: work queue of proxies to check
    :param item_dict: shared result/state dict
    """
    # Both base classes must be initialised; ProxyManager first so Thread
    # state is not disturbed by it — keep this order.
    ProxyManager.__init__(self)
    Thread.__init__(self)
    # file=False: writing one log file from many threads concurrently is
    # problematic (translated from: "多线程同时写一个日志文件会有问题").
    self.log = LogHandler('proxy_check', file=False)
    self.queue = queue
    self.item_dict = item_dict
def get(self, queue_id):
    """Return the queue items for *queue_id* together with an HTTP 200 status."""
    items = LogHandler().GetQueueItemsById(queue_id)
    return items, 200
def get(self):
    """Return the overall queue statistics together with an HTTP 200 status."""
    # Call GetQueueStats as a normal bound method instead of the original's
    # unbound-style LogHandler.GetQueueStats(handler) — identical behavior.
    handler = LogHandler()
    stats = handler.GetQueueStats()
    return stats, 200
"zh-CN,zh;q=0.8", "Cookie": 'aws-target-static-id=1497838335626-19381; s_vn=1529374335856%26vn%3D1; regStatus=pre-register; aws_lang=cn; aws-target-data=%7B%22support%22%3A%221%22%7D; aws-target-visitor-id=1497838335629-9703.24_4; amznacsleftnav-1e4dfe77-0d78-3527-b54a-f23cc2cb231e=1; AMCVS_4A8581745834114C0A495E2B%40AdobeOrg=1; AMCV_4A8581745834114C0A495E2B%40AdobeOrg=-894706358%7CMCIDTS%7C17415%7CMCMID%7C15071633912452894915662900002460765377%7CMCOPTOUT-1504585615s%7CNONE%7CMCAID%7C2CD68D00852A8569-40000301C00003C7%7CvVersion%7C2.3.0; s_lv=1504578420898; appstore-devportal-locale=zh_CN; s_vnum=1931133517751%26vn%3D4; s_nr=1504749318700-Repeat; s_dslv=1504749318701; s_ppv=85; at-main=Atza|IwEBIDDWvxoMHsifLiTWazK1cSAb6sPs44jFlHu4oL3LodNKVsU8NGdSm5K8FdmMWr__wQDtiusZ88pZrsaXKPN42AIgcztHnSaeYEUHhgy0P3sil04wpCLe2F1m_HK7eZDg0D6bxXBzGOttA8lN4Um0cObimx2j07DNb1KHtgA465FpjPDMqWfHdw_Uvy4vKwfqIzwSp2nO3iqK1VlRDQlcNCHFi33PfESso3Up1yHvkhiqBlVSVl-GSm7a49vnTMX-xGIhPWqDBCSEunEF8nPTZ1Y4zM0RCnQojWjaCiesNq_3iy3PgL-LKEEmxTNe6RixwK3d2Swd8dWjcy743XEdxa4xUeiD9wtrT8Zx0hyWIYejSDm7W0REBeViHrYNw8Pr927I5Vrlm6rN23psIff2--yF; sess-at-main="goK1tVYcgb7R2XcYasQ8/+2ABnZO2hdIYIDDrbHkOiM="; x-wl-uid=1XXT7ohIeJZyY/7sC/ytnTHT+Vwn62xt/Jqr4l2xe+WetZvII2HlOUcCgEXfKRBNwgbOTZrrdaMcZJNwjXVH48GrQ2/ROuA6CLbVJ1xZ9Jo0PgAxPQ9L1NbmGgdG1oBrv+QyImd7kzKk=; skin=noskin; JSESSIONID=5AF6FA51D68211D42C8B1E4F272968D3; s_sess=%20s_ppvl%3D%3B%20s_ppv%3DUS%25253AAZ%25253ASOA-overview-seeall%252C92%252C37%252C3448%252C2880%252C1348%252C1920%252C1080%252C0.67%252CL%3B%20s_cc%3Dtrue%3B%20c_m%3DundefinedTyped/BookmarkedTyped/Bookmarked%3B%20s_sq%3Dacsus-prod%253D%252526pid%25253D508510%252526pidt%25253D1%252526oid%25253Dhttps%2525253A%2525252F%2525252Fwww.amazon.com%2525252Fgp%2525252Fhelp%2525252Fcustomer%2525252Fforums%2525252Fkindleqna%2525252Fref%2525253Dhp_gt_comp_ss_forum_Kindle%252526ot%25253DA%3B; 
s_pers=%20s_fid%3D0B584A252E26D24B-0508DABA67938BE2%7C1662514482807%3B%20s_dl%3D1%7C1507518928552%3B%20gpv_page%3DUS%253ASC%253A%2520SellerCentralLogin%7C1507518928555%3B%20s_ev15%3D%255B%255B%2527Typed/Bookmarked%2527%252C%25271507517128514%2527%255D%252C%255B%2527Typed/Bookmarked%2527%252C%25271507517128557%2527%255D%255D%7C1665283528557%3B; x-main="Ihat36AUznBqU@bAXSLMRPKxSgCxnH1bEUQiqvtsIVRnH75aplq29jnQqj?LHESj"; lc-main=en_US; session-token="STQf3qhsr7tRS9RXdV+B5iLxVuHXTWxw3sz5jRhYxXQ8OePnWMVSkTiU5Wot7+yBX2/sYgnK2b585EUmLISbef7iNSg9mAtKSxMrAgH3WlNomIsvZw5cciATBtgld6xxTLN0L4qRqPXGW6ucpWuX9M0yI4EWddIswJKbCJK56B27pyn72QR0wPvzVVODwGZRxTxjBywnAJfXhbq/zyaIytgH0BvgASM5FAp6FyMVO7E/dq31m2t0EQY1Jx+jmFu6/mZy03RzzDPWKOZKlpVMSg=="; ubid-main=131-0601013-2724535; session-id-time=2082787201l; session-id=136-1823294-1208816; csm-hit=TC76C8PW39N5MY6AKGWM+s-FTFB6ESPHJ74V4N2XPGW|1507777709001' } csheaders = { "Accept": "application/json, text/javascript, */*; q=0.01", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", "Host": "www.fanzle.com", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "X-Requested-With": "XMLHttpRequest" } keylog = LogHandler('keylog') toplog = LogHandler('toplog') keylog = LogHandler('key') conn = MongoClient(host='127.0.0.1', port=27017) amxpage = conn['keySearch'] class getPage(object): """docstring for getPage""" def __init__(self, url): super(getPage, self).__init__() self.url = url def generateData(self): return {'url': self.url, 'headers': headers, 'timeout': 3, 'ac': None}
def __init__(self):
    """Load all crawler configuration, wire up logging and the DB, then run.

    NOTE(review): the constructor kicks off the whole crawl via self.run(),
    dispatching on sys.argv — this object doubles as the script entry point.
    """
    # =======================================================================
    # Configuration values
    # =======================================================================
    self.wwwPath = '/var/www/html/'
    # For downloaded images (translated from: "Resimler icin")
    self.linkImagePath = self.wwwPath + 'imageslink/'
    # User agent used for URL requests
    self.userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
    # Show links as they are received and processed
    self.showLink = False
    # Suffix for the links database table
    self.yearMonth = ""
    # Turkish character transliteration map
    self.turkishDict = {'ı': 'i', 'ü': 'u', 'ö': 'o', 'ş': 's', 'ç': 'c', 'ğ': 'g'}
    # =======================================================================
    # Configuration For Requests
    # =======================================================================
    # For some sites the article image is extracted by CSS class; multiple
    # classes can be given in the value, comma-separated without spaces.
    self.imageClassForSources = {
        'http://www.aksam.com.tr': 'image',
        'http://www.trthaber.com': 'image',
        'http://www.trthaber.com/haber/kultur-sanat/': 'image',
        'http://amkspor.sozcu.com.tr': 'in_image',
        'http://www.sosyalmedya.co': 'attachment-large wp-post-image',
        'http://www.teknolojioku.com': 'newsImage',
        'http://www.webrazzi.com': 'post-content',
        'http://www.taraf.com.tr/yazarlar/': 'info',
        'http://www.donanimhaber.com': 'entry',
        'http://www.bugun.com.tr': 'image',
        'http://www.milliyet.com.tr/Yazar.aspx?aType=Yazarlar': 'image',
        'http://www.gazetevatan.com/yazarlar/': 'aimg',
        'http://www.cumhuriyet.com.tr/yazarlar': 'author',
        'http://www.sabah.com.tr/Yazarlar': 'iBox',
        'http://www.ensonhaber.com': 'mansetresmi',
        'http://www.yenisafak.com.tr/yazarlar/': 'picture',
        'http://www.haberler.com': 'image',
        'http://www.internethaber.com': 'item img active',
        'http://haber.sol.org.tr': 'singlenews-image',
        # NOTE(review): duplicate key — this entry repeats the previous one
        # and the later value silently wins.
        'http://haber.sol.org.tr': 'singlenews-image',
        'http://www.yeniakit.com.tr/yazarlar': 'au-top-right',
        'http://www.takvim.com.tr': 'haberImg',
        'http://www.haberturk.com': 'image',
        'http://www.haber7.com': 'image_src',
        'http://www.mynet.com': 'twitter:image',
    }
    # Per-source override for the description <meta> tag to read
    self.descMetaTypes = {
        'http://www.radikal.com.tr/yazarlar': {"name": "twitter:description"},
        'http://www.radikal.com.tr/kultur': {"name": "twitter:description"},
        'http://webtv.radikal.com.tr': {"name": "twitter:description"},
        'http://www.radikal.com.tr': {"name": "twitter:description"},
        'http://www.mynet.com/teknoloji': {"property": "og:description"},
        'http://www.mynet.com': {"property": "og:description"},
        'http://webtv.hurriyet.com.tr': {"property": "og:description"},
        'http://www.haber7.com': {"name": "twitter:description"},
        'http://www.samanyoluhaber.com': {"property": "og:description"},
    }
    # Sources whose description is skipped and replaced by the title
    self.notGetDesc = []
    # Sources that keep the '?' character in links
    self.containQuestionCharacter = [
        'http://www.posta.com.tr',
        'http://www.odatv.com',
    ]
    # Sources for which the trailing '/' in links is NOT stripped
    self.notDeletedBackslashCharacter = [
        'http://www.posta.com.tr',
        'http://www.odatv.com',
        'http://www.taraf.com.tr',
        'http://www.taraf.com.tr/yazarlar/',
        'http://www.samanyoluhaber.com',
        'http://amkspor.sozcu.com.tr',
        'http://www.haberler.com',
        'http://www.diken.com.tr',
        'http://www.fizikist.com',
    ]
    # If the article image is too small/wrong, a Google image search by
    # title was used; disabled because Google rate-limits banned us.
    self.getGoogleImageList = []
    # Replacement to apply inside a source's image link after extraction
    self.replaceStringForLink = {
        'shiftdelete.net': ('/shiftdelete.net', '/s01.shiftdelete.net/img/general_b'),
    }
    # Links containing any of these keywords are skipped entirely
    self.notGetLinkIfContainThisKeyword = ['javascript:', ]
    # Sources with hotlink protection: their images are downloaded
    # locally instead of linked directly.
    self.hotlinks = ['ensonhaber.com', 'bugun.com.tr', 'internethaber.com',
                     'haberler.com', 'odatv.com', 'cumhuriyet.com.tr',
                     'zaman.com.tr', 'donanimhaber.com', 'ajansspor.com',
                     'haber.sol.org.tr']
    # Sources needing a different request type when characters come back broken
    self.requestTypes = ['amkspor.sozcu.com.tr', 'yenisafak.com.tr', 'indir.com']
    # Sources whose page source arrives encoded; a different request is made
    self.encodePageSource = ['mynet.com', 'trthaber.com']
    # Tweet-count URL fixups: 0 = prepend www, 1 = strip www, 2 = strip http://www
    self.getTweetCountFix = {'yenisafak.com.tr': 0, 'bigumigu.com': 1,
                             'webrazzi.com': 2, 'odatv.com': 2}
    # Sources whose title/description need character cleanup
    self.contentTitleDescReplace = ['zaman.com.tr', 'shiftdelete.net']
    # Sources whose links contain ',' — alternate Facebook share-count path
    self.linkContentComma = ['t24.com.tr', 'ntv.com.tr']
    # Skip links you do not want to see (matched against imageLink only)
    self.blackListLinkImage = {
    }
    # =======================================================================
    # Collaborators
    self.logHandler = LogHandler("Main")
    # Database connection
    self.serverHandler = ServerDatabaseHandler()
    # Local vs. server run separation
    self.getLinkCountLimit = 1500
    # CLI dispatch: [category] or [category source]; otherwise crawl everything.
    if len(sys.argv) == 2:
        category = sys.argv[1]
        self.run(category)
    elif len(sys.argv) == 3:
        category = sys.argv[1]
        source = sys.argv[2]
        self.run(category, source)
    else:
        self.run()
class Main:
    """News crawler entry point (Python 2: urllib2, print statements).

    Construction loads all configuration and immediately starts the crawl.
    """

    def __init__(self):
        """Load all crawler configuration, wire up logging and the DB, then run.

        NOTE(review): the constructor kicks off the whole crawl via self.run(),
        dispatching on sys.argv — this object doubles as the script entry point.
        """
        # =======================================================================
        # Configuration values
        # =======================================================================
        self.wwwPath = '/var/www/html/'
        # For downloaded images (translated from: "Resimler icin")
        self.linkImagePath = self.wwwPath + 'imageslink/'
        # User agent used for URL requests
        self.userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
        # Show links as they are received and processed
        self.showLink = False
        # Suffix for the links database table
        self.yearMonth = ""
        # Turkish character transliteration map
        self.turkishDict = {'ı': 'i', 'ü': 'u', 'ö': 'o', 'ş': 's', 'ç': 'c', 'ğ': 'g'}
        # =======================================================================
        # Configuration For Requests
        # =======================================================================
        # For some sites the article image is extracted by CSS class; multiple
        # classes can be given in the value, comma-separated without spaces.
        self.imageClassForSources = {
            'http://www.aksam.com.tr': 'image',
            'http://www.trthaber.com': 'image',
            'http://www.trthaber.com/haber/kultur-sanat/': 'image',
            'http://amkspor.sozcu.com.tr': 'in_image',
            'http://www.sosyalmedya.co': 'attachment-large wp-post-image',
            'http://www.teknolojioku.com': 'newsImage',
            'http://www.webrazzi.com': 'post-content',
            'http://www.taraf.com.tr/yazarlar/': 'info',
            'http://www.donanimhaber.com': 'entry',
            'http://www.bugun.com.tr': 'image',
            'http://www.milliyet.com.tr/Yazar.aspx?aType=Yazarlar': 'image',
            'http://www.gazetevatan.com/yazarlar/': 'aimg',
            'http://www.cumhuriyet.com.tr/yazarlar': 'author',
            'http://www.sabah.com.tr/Yazarlar': 'iBox',
            'http://www.ensonhaber.com': 'mansetresmi',
            'http://www.yenisafak.com.tr/yazarlar/': 'picture',
            'http://www.haberler.com': 'image',
            'http://www.internethaber.com': 'item img active',
            'http://haber.sol.org.tr': 'singlenews-image',
            # NOTE(review): duplicate key — repeats the previous entry.
            'http://haber.sol.org.tr': 'singlenews-image',
            'http://www.yeniakit.com.tr/yazarlar': 'au-top-right',
            'http://www.takvim.com.tr': 'haberImg',
            'http://www.haberturk.com': 'image',
            'http://www.haber7.com': 'image_src',
            'http://www.mynet.com': 'twitter:image',
        }
        # Per-source override for the description <meta> tag to read
        self.descMetaTypes = {
            'http://www.radikal.com.tr/yazarlar': {"name": "twitter:description"},
            'http://www.radikal.com.tr/kultur': {"name": "twitter:description"},
            'http://webtv.radikal.com.tr': {"name": "twitter:description"},
            'http://www.radikal.com.tr': {"name": "twitter:description"},
            'http://www.mynet.com/teknoloji': {"property": "og:description"},
            'http://www.mynet.com': {"property": "og:description"},
            'http://webtv.hurriyet.com.tr': {"property": "og:description"},
            'http://www.haber7.com': {"name": "twitter:description"},
            'http://www.samanyoluhaber.com': {"property": "og:description"},
        }
        # Sources whose description is skipped and replaced by the title
        self.notGetDesc = []
        # Sources that keep the '?' character in links
        self.containQuestionCharacter = [
            'http://www.posta.com.tr',
            'http://www.odatv.com',
        ]
        # Sources for which the trailing '/' in links is NOT stripped
        self.notDeletedBackslashCharacter = [
            'http://www.posta.com.tr',
            'http://www.odatv.com',
            'http://www.taraf.com.tr',
            'http://www.taraf.com.tr/yazarlar/',
            'http://www.samanyoluhaber.com',
            'http://amkspor.sozcu.com.tr',
            'http://www.haberler.com',
            'http://www.diken.com.tr',
            'http://www.fizikist.com',
        ]
        # If the article image is too small/wrong, a Google image search by
        # title was used; disabled because Google rate-limits banned us.
        self.getGoogleImageList = []
        # Replacement to apply inside a source's image link after extraction
        self.replaceStringForLink = {
            'shiftdelete.net': ('/shiftdelete.net', '/s01.shiftdelete.net/img/general_b'),
        }
        # Links containing any of these keywords are skipped entirely
        self.notGetLinkIfContainThisKeyword = ['javascript:', ]
        # Sources with hotlink protection: their images are downloaded
        # locally instead of linked directly.
        self.hotlinks = ['ensonhaber.com', 'bugun.com.tr', 'internethaber.com',
                         'haberler.com', 'odatv.com', 'cumhuriyet.com.tr',
                         'zaman.com.tr', 'donanimhaber.com', 'ajansspor.com',
                         'haber.sol.org.tr']
        # Sources needing a different request type when characters come back broken
        self.requestTypes = ['amkspor.sozcu.com.tr', 'yenisafak.com.tr', 'indir.com']
        # Sources whose page source arrives encoded; a different request is made
        self.encodePageSource = ['mynet.com', 'trthaber.com']
        # Tweet-count URL fixups: 0 = prepend www, 1 = strip www, 2 = strip http://www
        self.getTweetCountFix = {'yenisafak.com.tr': 0, 'bigumigu.com': 1,
                                 'webrazzi.com': 2, 'odatv.com': 2}
        # Sources whose title/description need character cleanup
        self.contentTitleDescReplace = ['zaman.com.tr', 'shiftdelete.net']
        # Sources whose links contain ',' — alternate Facebook share-count path
        self.linkContentComma = ['t24.com.tr', 'ntv.com.tr']
        # Skip links you do not want to see (matched against imageLink only)
        self.blackListLinkImage = {
        }
        # =======================================================================
        # Collaborators
        self.logHandler = LogHandler("Main")
        # Database connection
        self.serverHandler = ServerDatabaseHandler()
        # Local vs. server run separation
        self.getLinkCountLimit = 1500
        # CLI dispatch: [category] or [category source]; otherwise crawl everything.
        if len(sys.argv) == 2:
            category = sys.argv[1]
            self.run(category)
        elif len(sys.argv) == 3:
            category = sys.argv[1]
            source = sys.argv[2]
            self.run(category, source)
        else:
            self.run()

    def getGoogleImage(self, searchTerm):
        """Search Google Images (deprecated AJAX API) for *searchTerm*.

        :param searchTerm: free-text query; transliterated and truncated to 4 words
        :return: URL of the first image at least 300x300, or "" on any failure
        """
        searchTerm = self.multipleReplace(searchTerm, self.turkishDict)
        searchTerm = '%20'.join(searchTerm.split()[:4])
        # Set count to 0
        count = 0
        imageUrl = ""
        # Notice that the start changes for each iteration in order to
        # request a new set of images for each loop
        try:
            url = 'https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0&q=' + searchTerm
            request = urllib2.Request(url, None, {'User-Agent': self.userAgent})
            response = urllib2.urlopen(request, timeout=10)
            # Get results using JSON
            results = simplejson.load(response)
            data = results['responseData']
            dataInfo = data['results']
            # Iterate for each result and get unescaped url
            for myUrl in dataInfo:
                count = count + 1
                if int(myUrl['width']) >= 300 and int(myUrl['height']) >= 300:
                    imageUrl = myUrl['unescapedUrl']
                else:
                    imageUrl = ""
                if imageUrl:
                    break
        except:
            # NOTE(review): bare except silently maps every failure
            # (network, JSON shape, missing keys) to "no image".
            imageUrl = ""
        return imageUrl

    def multipleReplace(self, text, wordDict):
        """Replace every dict key occurring in *text* with its value."""
        for key in wordDict:
            text = text.replace(key, wordDict[key])
        return text

    def downloadImage(self, source, link, path):
        """Download the image at *link* into *path* and return its file name.

        :param source: source domain (drives anti-bot and naming quirks)
        :param link: image URL
        :param path: destination directory (must end with '/')
        :return: image file name on success, 0 on failure
        """
        # For these sources a request is first sent to the redirect URL the
        # site returns, to prove we are not a bot (translated comment).
        if source == 'cumhuriyet.com.tr':
            try:
                ln = 'http://www.cumhuriyet.com.tr'
                req = urllib2.Request(ln, headers={'User-Agent': self.userAgent})
                html = urllib2.urlopen(req, timeout=5).read()
                # A tiny response is the bot-check page containing "url=..."
                if len(html) < 250:
                    firstIndex = html.find('url=')
                    endIndex = html[firstIndex:].find('"') + firstIndex
                    url = html[firstIndex + len('url='): endIndex]
                    if url and url.find('http') != -1:
                        req = urllib2.Request(url, headers={'User-Agent': self.userAgent})
                        html = urllib2.urlopen(req, timeout=5).read()
                        print url
            except:
                self.logHandler.logger("run")
        # Same anti-bot dance for odatv (different size threshold).
        elif source == 'odatv.com':
            try:
                ln = 'http://www.odatv.com'
                req = urllib2.Request(ln, headers={'User-Agent': self.userAgent})
                html = urllib2.urlopen(req, timeout=5).read()
                if len(html) < 300:
                    firstIndex = html.find('url=')
                    endIndex = html[firstIndex:].find('"') + firstIndex
                    url = html[firstIndex + len('url='): endIndex]
                    if url and url.find('http') != -1:
                        req = urllib2.Request(url, headers={'User-Agent': self.userAgent})
                        html = urllib2.urlopen(req, timeout=5).read()
                        print url
            except:
                self.logHandler.logger("run")
        req = urllib2.Request(link, headers={'User-Agent': self.userAgent})
        try:
            imageId = "default.png"
            # Derive a local file name from the link.
            if source == 'cumhuriyet.com.tr':
                # cumhuriyet: use the last two path segments joined.
                imageId = link[link[:link.rfind('/')].rfind('/') + 1:].replace('/', '')
            else:
                imageId = link[link.rfind('/') + 1:]
            # Strip query string / fragment from the name.
            # NOTE(review): original formatting was lost; this trimming is
            # assumed to apply to both branches — confirm.
            if imageId.find('?') != -1:
                imageId = imageId[:imageId.find('?')]
            elif imageId.find('#') != -1:
                imageId = imageId[:imageId.find('#')]
            # Only download when not already cached on disk.
            if not os.path.exists(path + imageId):
                htmlSource = urllib2.urlopen(req, timeout=5).read()
                with open(path + imageId, "wb") as f:
                    f.write(htmlSource)
            return imageId
        except Exception, error:
            print error, link
            return 0
def __init__(self):
    """Bootstrap the installer environment: defaults, platform detection,
    sandbox layout, config load and log handlers.

    :param configdir: default /sandbox/cfg, then ~/sandbox/cfg if not exists
    :return:
    """
    self.tools = Tools(self)
    self.DEFAULT_BRANCH = DEFAULT_BRANCH
    self.readonly = False  # if readonly will not manipulate local filesystem appart from /tmp
    self.sandbox_python_active = False  # means we have a sandboxed environment where python3 works in
    self.sandbox_lua_active = False  # same for lua
    self.config_changed = False
    self._cmd_installed = {}
    # should be the only location where we allow logs to be going elsewhere
    self.loghandlers = []
    self.errorhandlers = []
    self.state = None
    self.__init = False
    self.debug = False
    self.log_console = False
    self.log_level = 15
    self._secret = None
    self.interactive = False
    self.appname = "installer"
    self.FORMAT_TIME = "%a %d %H:%M:%S"
    # ANSI color escape table; MYCOLORS_IGNORE is the no-color variant.
    self.MYCOLORS = {
        "RED": "\033[1;31m",
        "BLUE": "\033[1;34m",
        "CYAN": "\033[1;36m",
        "GREEN": "\033[0;32m",
        "GRAY": "\033[0;37m",
        "YELLOW": "\033[0;33m",
        "RESET": "\033[0;0m",
        "BOLD": "\033[;1m",
        "REVERSE": "\033[;7m",
    }
    self.MYCOLORS_IGNORE = {
        "RED": "",
        "BLUE": "",
        "CYAN": "",
        "GREEN": "",
        "GRAY": "",
        "YELLOW": "",
        "RESET": "",
        "BOLD": "",
        "REVERSE": "",
    }
    LOGFORMATBASE = "{COLOR}{TIME} {filename:<20}{RESET} -{linenr:4d} - {GRAY}{context:<35}{RESET}: {message}"  # DO NOT CHANGE COLOR
    # Per-level log formats; {COLOR} placeholder is replaced per level.
    self.LOGFORMAT = {
        "DEBUG": LOGFORMATBASE.replace("{COLOR}", "{CYAN}"),
        "STDOUT": "{message}",
        # 'INFO': '{BLUE}* {message}{RESET}',
        "INFO": LOGFORMATBASE.replace("{COLOR}", "{BLUE}"),
        "WARNING": LOGFORMATBASE.replace("{COLOR}", "{YELLOW}"),
        "ERROR": LOGFORMATBASE.replace("{COLOR}", "{RED}"),
        "CRITICAL": "{RED}{TIME} {filename:<20} -{linenr:4d} - {GRAY}{context:<35}{RESET}: {message}",
    }
    self.GITREPOS = GITREPOS
    self._db = None
    # Platform-specific installer strategies.
    self.installers = Installers()
    self.installers.osx = OSXInstaller(self)
    self.installers.ubuntu = UbuntuInstaller(self)
    self.installers.base = BaseInstaller(self)
    self.installers.jumpscale = JumpscaleInstaller(self)
    self.docker = DockerFactory(self)
    self.redis = RedisTools(self)
    # Platform detection flags; note platform_is_windows is only assigned
    # on the win32 branch — it is undefined on linux/darwin.
    if self.platform() == "linux":
        self.platform_is_linux = True
        self.platform_is_unix = True
        self.platform_is_osx = False
    elif "darwin" in self.platform():
        self.platform_is_linux = False
        self.platform_is_unix = True
        self.platform_is_osx = True
    elif "win32" in self.platform():
        self.platform_is_linux = False
        self.platform_is_unix = False
        self.platform_is_osx = False
        self.platform_is_windows = True
    else:
        raise self.tools.exceptions.Base(
            "platform not supported, only linux or osx and windows for now."
        )
    configdir = self._cfgdir_get()
    basedir = self._basedir_get()
    # Create the /sandbox layout (owned by the current user) if missing.
    if basedir == "/sandbox" and not os.path.exists(basedir):
        script = """
        set -e
        cd /
        sudo mkdir -p /sandbox/cfg
        sudo chown -R {USERNAME}:{GROUPNAME} /sandbox
        mkdir -p /usr/local/EGG-INFO
        sudo chown -R {USERNAME}:{GROUPNAME} /usr/local/EGG-INFO
        """
        args = {}
        args["USERNAME"] = getpass.getuser()
        st = os.stat(self.config["DIR_HOME"])
        gid = st.st_gid
        # import is here cause it's only unix
        # for windows support
        import grp

        args["GROUPNAME"] = grp.getgrgid(gid)[0]
        self.tools.execute(script, interactive=True, args=args, die_if_args_left=True)
    # Set codedir
    self.tools.dir_ensure(f"{basedir}/code")
    self.config_file_path = os.path.join(configdir, "jumpscale_config.toml")
    self.state_file_path = os.path.join(configdir, "jumpscale_done.toml")
    if self.tools.exists(self.config_file_path):
        self._config_load()
        # Bail out of init early when the loaded config is incomplete.
        if not "DIR_BASE" in self.config:
            return
    else:
        self.config = self.config_default_get()
    # Logger include/exclude filters, with empty/quoted entries dropped.
    self.log_includes = [
        i for i in self.config.get("LOGGER_INCLUDE", []) if i.strip().strip("''") != ""
    ]
    self.log_excludes = [
        i for i in self.config.get("LOGGER_EXCLUDE", []) if i.strip().strip("''") != ""
    ]
    self.log_level = self.config.get("LOGGER_LEVEL", 10)
    # self.log_console = self.config.get("LOGGER_CONSOLE", False)
    # self.log_redis = self.config.get("LOGGER_REDIS", True)
    self.debug = self.config.get("DEBUG", False)
    if "JSXDEBUG" in os.environ:
        self.debug = True
    self.debugger = self.config.get("DEBUGGER", "pudb")
    # Sandbox python detection is pinned to python3.6 — version-specific.
    if os.path.exists(os.path.join(self.config["DIR_BASE"], "bin", "python3.6")):
        self.sandbox_python_active = True
    else:
        self.sandbox_python_active = False
    self._state_load()
    self.sshagent = SSHAgent(myenv=self)
    # Route all uncaught exceptions through our handler.
    sys.excepthook = self.excepthook
    if self.tools.exists("{}/bin".format(self.config["DIR_BASE"])):  # To check that Js is on host
        self.loghandler_redis = LogHandler(self, db=self.db)
    else:
        # print("- redis loghandler cannot be loaded")
        self.loghandler_redis = None
    self.__init = True
$ export AWS_SECRET_ACCESS_KEY=<Your AWS Secret Access Key> """ import sys import boto from LogHandler import LogHandler #################################################################### # # GLOBALS # #################################################################### # Logger with given log file name (placed in /tmp/) logger = LogHandler(log_filename='s3-image-load.log') #################################################################### # # FUNCTIONS # #################################################################### def connect(): """ Connect to S3 with credentials """ log = logger.get_logger("connect") try: