示例#1
0
class ServerDatabaseHandler:
    """Thin wrapper around one MySQLdb connection and its cursor.

    The connection is opened by the constructor, switched to UTF-8 and put
    into autocommit mode.  A failure while connecting is handed to
    ``LogHandler.logger`` and not re-raised, so a failed instance may lack
    the ``serverHandler`` / ``cursor`` attributes.
    """

    def __init__(self, host='127.0.0.1', user='root',
                 password='yourdbpassowrd', database='haberbus'):
        """Connect to MySQL and prepare a UTF-8 cursor.

        The defaults reproduce the previously hard-coded connection values.

        :param host: server address passed to ``MySQLdb.connect``
        :param user: account name
        :param password: account password
        :param database: schema to select
        """
        # Created before the try block so the except branch can always use it.
        self.logHandler = LogHandler("ServerDatabaseHandle")
        try:
            self.serverHandler = MySQLdb.connect(host, user, password, database)
            self.serverHandler.set_character_set('utf8')
            self.serverHandler.autocommit(True)

            self.cursor = self.serverHandler.cursor()
            for statement in ('SET NAMES utf8;',
                              'SET CHARACTER SET utf8;',
                              'SET character_set_connection=utf8;'):
                self.cursor.execute(statement)
        except Exception:
            # Best effort, as before: report through the log handler only.
            self.logHandler.logger("__init__")

    def executeQuery(self, query):
        """Run ``query`` on the shared cursor.

        :param query: SQL text
        :return: all fetched rows when the first word of the query is
                 ``select`` (case-insensitive); otherwise the connection is
                 committed and ``None`` is returned
        """
        self.cursor.execute(query)
        words = query.split()
        # A blank query has no first word; treat it like any non-SELECT.
        if words and words[0].lower() == "select":
            return self.cursor.fetchall()
        self.serverHandler.commit()
        return None

    def closeConnection(self):
        """Close the underlying database connection."""
        self.serverHandler.close()
示例#2
0
文件: kiwoom.py 项目: imtaehyun/stock
    def __init__(self, view):
        """Set up logging and the Kiwoom OpenAPI control, then hook its events.

        :param view: stored on ``self.view`` and passed to ``LogHandler``
        """

        self.user = None
        self.view = view

        # Initialize the logger: a LogHandler built from the view is attached
        # to the module-level ``logger`` at DEBUG level
        logHandler = LogHandler(self.view)
        logHandler.setLevel(logging.DEBUG)
        logger.addHandler(logHandler)

        # Create the Kiwoom Securities API OCX instance
        self.kiwoom = QAxWidget("KHOPENAPI.KHOpenAPICtrl.1")

        # Register the event handlers: each OCX event, selected by its
        # argument-type signature, is connected to the method of the same
        # name on this object
        self.kiwoom.OnEventConnect[int].connect(self.OnEventConnect)
        self.kiwoom.OnReceiveTrData[str, str, str, str, str, int, str, str, str].connect(self.OnReceiveTrData)
        self.kiwoom.OnReceiveRealData[str, str, str].connect(self.OnReceiveRealData)
        self.kiwoom.OnReceiveMsg[str, str, str, str].connect(self.OnReceiveMsg)
        self.kiwoom.OnReceiveChejanData[str, int, str].connect(self.OnReceiveChejanData)
        self.kiwoom.OnReceiveRealCondition[str, str, str, str].connect(self.OnReceiveRealCondition)
        self.kiwoom.OnReceiveTrCondition[str, str, str, int, int].connect(self.OnReceiveTrCondition)
        self.kiwoom.OnReceiveConditionVer[int, str].connect(self.OnReceiveConditionVer)
示例#3
0
    def __init__(self, host='127.0.0.1', user='root',
                 password='yourdbpassowrd', database='haberbus'):
        """Connect to MySQL and prepare a UTF-8 cursor in autocommit mode.

        The defaults reproduce the previously hard-coded connection values.
        A failure while connecting is handed to ``LogHandler.logger`` and not
        re-raised, so ``serverHandler`` / ``cursor`` may be missing afterwards.

        :param host: server address passed to ``MySQLdb.connect``
        :param user: account name
        :param password: account password
        :param database: schema to select
        """
        # Created before the try block so the except branch can always use it.
        self.logHandler = LogHandler("ServerDatabaseHandle")
        try:
            self.serverHandler = MySQLdb.connect(host, user, password, database)
            self.serverHandler.set_character_set('utf8')
            self.serverHandler.autocommit(True)

            self.cursor = self.serverHandler.cursor()
            for statement in ('SET NAMES utf8;',
                              'SET CHARACTER SET utf8;',
                              'SET character_set_connection=utf8;'):
                self.cursor.execute(statement)
        except Exception:
            # Best effort, as before: report through the log handler only.
            self.logHandler.logger("__init__")
示例#4
0
import random
import time
import json

import requests
from requests.adapters import HTTPAdapter
from LogHandler import LogHandler

log = LogHandler('downloader', file=False)


def get_proxy(timeout=10):
    """Fetch one proxy from the proxy-pool HTTP service.

    The local pool at 127.0.0.1:5010 is asked first; if that request fails,
    the remote backup pool is asked instead.

    :param timeout: seconds to wait on each HTTP request; without a timeout
                    ``requests.get`` may block indefinitely
    :return: the response body text, returned as-is
    :raises requests.RequestException: when the backup request fails as well
    """
    bak_url = 'http://123.207.35.36:5010/get/'
    url = 'http://127.0.0.1:5010/get'
    try:
        return requests.get(url, timeout=timeout).text
    except requests.RequestException:
        print('本地获取代理失败,远程从获取')
        return requests.get(bak_url, timeout=timeout).text


class Downloader(object):
    def __init__(self, *args, **kwargs):
        pass

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return:
-------------------------------------------------
   File Name:	 CheckProxy
   Description :   used for check getFreeProxy.py
   Author :	    Tc
   date:	      2018/12/24
-------------------------------------------------
"""
__author__ = 'Tc'

import sys
from getFreeProxy import GetFreeProxy
from utilFunction import verifyProxyFormat

from LogHandler import LogHandler

log = LogHandler('check_proxy', file=False)


class CheckProxy(object):
    @staticmethod
    def checkAllGetProxyFunc():
        """
	    检查getFreeProxy所有代理获取函数运行情况
	    Returns:
	        None
	    """
        import inspect
        member_list = inspect.getmembers(GetFreeProxy,
                                         predicate=inspect.isfunction)
        proxy_count_dict = dict()
        for func_name, func in member_list:
示例#6
0
class DatabaseHandler:
    """SQLite helper that assembles simple SQL statements from Python lists.

    Identifiers and values are concatenated into the SQL text verbatim:
    callers must pass string values already quoted (e.g. ``"'abc'"``) and
    must never pass untrusted input.
    """

    def __init__(self, path, log_level=0):
        """
        :param path: path to the db, including db name
        :param log_level: 0-4, Debug...Critical, see LogHandler class
        """
        self.__path = path
        self.__tables = {}
        self.__keywords = {}
        self.__lh = LogHandler(log_level)

    def open(self):
        """ open database and create the cursor used by every other method """
        self.__db = sqlite3.connect(self.__path)
        self.__dbcursor = self.__db.cursor()
        self.__lh.info("DB is opened: " + self.__path)

    def close(self):
        """ close database """
        self.__db.close()
        self.__lh.info("DB is closed: " + self.__path)

    def get_table_name(self, name):
        """
        get columns of the selected table
        :param name: table name
        :return: list of column names, read from ``PRAGMA table_info``
        """
        self.__dbcursor.execute("PRAGMA table_info('" + name + "')")
        # Field 1 of every table_info row is the column name.
        columns = [col[1] for col in self.__dbcursor.fetchall()]
        self.__lh.debug("Table columns: " + ", ".join(columns))
        return columns

    def set_used_table(self, name):
        """
        set which table is used from database
        :param name: table name
        """
        self.__tables[name] = self.get_table_name(name)
        self.__lh.info("New table is set to used: " + name)

    def release_used_table(self, name):
        """
        release the given table
        :param name: table name
        :raises KeyError: if the table was not set as used
        """
        del self.__tables[name]
        self.__lh.info("Table is released from used tables: " + name)

    def clear_used_tables(self):
        """ release all used tables """
        self.__tables.clear()
        self.__lh.info("All tables are released from used tables")

    def execute_query(self, query):
        """
        executes the given sqlite query string
        :param query: sqlite query string
        :return: db's response to the query; an empty list when the query
                 fails (the error is passed to the log handler, not raised)
        """
        try:
            self.__dbcursor.execute(query)
            self.__lh.info("Executed query: " + query)
        except Exception as e:
            self.__lh.exceptionHandling(e)
            # Do not fetch from a cursor whose execute() just failed.
            return []
        return self.__dbcursor.fetchall()

    def commit(self):
        """ apply changes done to database """
        self.__db.commit()

    def insert(self, table_name, data, columns=None):
        """
        builds and executes INSERT query
        :param table_name: used table's name
        :param data: list of strings, data of a row to be inserted to the given table
        :param columns: list of columns where the given data should be inserted;
                        when empty or omitted the column list is left out of the
                        statement, so ``data`` must cover every table column
        """
        columns = [] if columns is None else columns
        if self.is_registered(table_name, data, columns):
            self.__lh.warning("Record already inserted")
            return
        try:
            parts = ["INSERT INTO", table_name]
            if columns:
                # An empty "()" column list is not valid SQL, so emit it only
                # when there is something to list.
                parts.append("(" + ",".join(columns) + ")")
            parts.append("VALUES")
            parts.append("(" + ",".join(data) + ")")
            self.execute_query(" ".join(parts) + ";")
        except Exception as e:
            self.__lh.exceptionHandling(e)

    def select(self, table_name, cols=None, condition=None, distinct=False):
        """
        builds and executes SELECT query
        :param table_name: used table's name
        :param cols: list of columns should be selected, ``["*"]`` for all
        :param condition: list of (column, operator, value) tuples, see where_clause
        :param distinct: set True if SELECT DISTINCT query is required
        :return: one list per row, each a list of (column name, value) tuples;
                 empty when ``cols`` is empty or the query fails
        """
        zipped = []
        try:
            if not cols:
                raise Exception("Empty selected columns list")

            keyword = "SELECT DISTINCT" if distinct else "SELECT"
            args = "*" if cols == ["*"] else ", ".join(cols)
            where = self.where_clause(condition) if condition else ""
            query = " ".join([keyword, args, "FROM " + table_name, where]) + ";"
            rows = self.execute_query(query)

            names = cols
            if args == "*" and rows:
                # "*" expands to several columns; take the real names from
                # the cursor so every value gets its own label.
                names = [desc[0] for desc in self.__dbcursor.description]

            for row in rows:
                self.__lh.debug("Select raw return values: " + (str(row)))
                pairs = list(zip(names, row))
                zipped.append(pairs)
                line = ", ".join(str(n) + " = " + str(v) for n, v in pairs)
                self.__lh.debug("Returned line: " + line)
            self.__lh.debug(zipped)
        except Exception as e:
            self.__lh.exceptionHandling(e)
        # Returned after the try statement, not from a ``finally`` clause,
        # so KeyboardInterrupt / SystemExit are no longer swallowed.
        return zipped

    def update(self, table_name, data=None, condition=None):
        """
        builds and executes UPDATE query
        :param table_name: used table's name
        :param data: list of (column, value) tuples; the list is not modified
        :param condition: list of (column, operator, value) tuples, see where_clause
        """
        keyword = "UPDATE " + table_name + " SET"
        # Build a new list; rewriting ``data`` in place corrupted the
        # caller's list when it was reused for a second call.
        assignments = [(pair[0], "=", pair[1]) for pair in (data or [])]
        d_args = self.tuple_list_to_statement(assignments)
        c_args = self.where_clause(condition)
        query = " ".join([keyword, d_args, c_args]) + ";"
        self.execute_query(query)

    def delete(self, table_name, condition=None):
        """
        builds and executes DELETE query
        :param table_name: used table's name
        :param condition: list of (column, operator, value) tuples, see where_clause
        """
        keyword = "DELETE FROM " + table_name
        c_args = self.where_clause(condition)
        query = " ".join([keyword, c_args]) + ";"
        self.execute_query(query)

    def where_clause(self, condition=None):
        """
        builds WHERE clause
        :param condition: list of (column, operator, value) tuples
        :return: WHERE clause string, conditions combined with AND

        NOTE(review): an empty condition still yields a bare "WHERE ", which
        makes update()/delete() fail instead of touching every row; kept as is.
        """
        clause = "WHERE " + self.tuple_list_to_statement(condition, " AND ")
        self.__lh.debug("WHERE clause: " + clause)
        return clause

    def and_clause(self):
        # TODO
        pass

    def or_clause(self):
        # TODO
        pass

    def like_clause(self):
        # TODO
        pass

    def glob_clause(self):
        # TODO
        pass

    def limit_clause(self):
        # TODO
        pass

    def order_by_clause(self):
        # TODO
        pass

    def group_by_clause(self):
        # TODO
        pass

    def having_clause(self):
        # TODO
        pass

    def tuple_list_to_statement(self, tuple_list=None, separator=","):
        """
        builds string from tuples
        :param tuple_list: tuple list, it should look like this when building conditions: [(column, relation operator, value), (c,ro,v),...]
        :param separator: text placed between the rendered tuples
        :return: a single string, each tuple rendered as "c ro v"
        """
        return separator.join(
            str(t[0]) + " " + str(t[1]) + " " + str(t[2])
            for t in (tuple_list or [])
        )

    def is_registered(self, table_name, data_list=None, col_list=None):
        """
        reports whether the given row is already stored
        :param table_name: used table's name
        :param data_list: values of the row
        :param col_list: columns the values belong to
        :return: always False; the duplicate lookup is disabled
        """
        self.__lh.debug([] if data_list is None else data_list)
        self.__lh.debug([] if col_list is None else col_list)
        # TODO: the lookup (a SELECT matching each value against its column)
        # was already commented out; until it is restored every record is
        # treated as new.
        return False
示例#7
0
 def __init__(self, link_queue, data_queue, thread_name, proxies):
     """Initialise the thread under ``thread_name`` and keep the queues, proxies and logger."""
     threading.Thread.__init__(self, name=thread_name)
     self.link_queue, self.data_queue = link_queue, data_queue
     self.proxies = proxies
     self.log = LogHandler('detail_data_crawler')
示例#8
0
    def __init__(self):
        """Create the log and database handlers, then call ``run()`` right away."""

        self.logHandler = LogHandler("Main")
        self.serverHandler = ServerDatabaseHandler()
        # Constructing the object starts the work: run() is invoked from here.
        self.run()
示例#9
0
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import requests, sys, time
sys.path.append('../')
from getProxy.manipulateProxy import get, delet
from LogHandler import LogHandler
reqErro = LogHandler('neterro', stream=False)


class request(object):
    """docstring for request"""
    def __init__(self, data):
        """Keep ``data``; ``judge`` reads and rewrites its ``'url'`` entry."""
        super(request, self).__init__()
        self.data = data

    def judge(self):
        """Prefix ``data['url']`` with the Amazon site root unless it already contains it."""
        address = self.data['url']
        if 'https://www.amazon.com' not in address:
            self.data['url'] = 'https://www.amazon.com/' + address

    def cport(self):
        """Take one value from ``get()`` and return it as an ``http://`` address under both the 'http' and 'https' keys."""
        address = "http://" + get()
        return {"http": address, "https": address}

    def webGet(self):
        try:
            proxies = []
 def __init__(self):
     """Create the DbClient and logger and record the two proxy queue names."""
     # Imported here, inside the method, exactly as the code did before.
     from db.DbClient import DbClient
     self.raw_proxy_queue, self.useful_proxy_queue = 'raw_proxy', 'useful_proxy'
     self.db = DbClient()
     self.log = LogHandler('proxy_manager')
示例#11
0
 def __init__(self):
     """Run ``ProxyManager.__init__`` and then set ``self.log`` to a 'refresh_schedule' LogHandler."""
     ProxyManager.__init__(self)
     self.log = LogHandler('refresh_schedule')
示例#12
0
 def __init__(self, queue, item_dict):
     """Initialise both bases (ProxyManager, then Thread) and keep the work queue and item dict."""
     ProxyManager.__init__(self)
     Thread.__init__(self)
     self.queue, self.item_dict = queue, item_dict
     # file=False: several threads writing the same log file at once causes problems
     self.log = LogHandler('proxy_check', file=False)
示例#13
0
 def get(self, queue_id):
     """Return the queue items for ``queue_id`` from a fresh LogHandler, paired with status 200."""
     return LogHandler().GetQueueItemsById(queue_id), 200
示例#14
0
 def get(self):
     """Return the queue statistics from a fresh LogHandler, paired with status 200."""
     handler = LogHandler()
     # Called through the class with the instance passed explicitly, as before.
     stats = LogHandler.GetQueueStats(handler)
     return stats, 200
示例#15
0
    "zh-CN,zh;q=0.8",
    "Cookie":
    'aws-target-static-id=1497838335626-19381; s_vn=1529374335856%26vn%3D1; regStatus=pre-register; aws_lang=cn; aws-target-data=%7B%22support%22%3A%221%22%7D; aws-target-visitor-id=1497838335629-9703.24_4; amznacsleftnav-1e4dfe77-0d78-3527-b54a-f23cc2cb231e=1; AMCVS_4A8581745834114C0A495E2B%40AdobeOrg=1; AMCV_4A8581745834114C0A495E2B%40AdobeOrg=-894706358%7CMCIDTS%7C17415%7CMCMID%7C15071633912452894915662900002460765377%7CMCOPTOUT-1504585615s%7CNONE%7CMCAID%7C2CD68D00852A8569-40000301C00003C7%7CvVersion%7C2.3.0; s_lv=1504578420898; appstore-devportal-locale=zh_CN; s_vnum=1931133517751%26vn%3D4; s_nr=1504749318700-Repeat; s_dslv=1504749318701; s_ppv=85; at-main=Atza|IwEBIDDWvxoMHsifLiTWazK1cSAb6sPs44jFlHu4oL3LodNKVsU8NGdSm5K8FdmMWr__wQDtiusZ88pZrsaXKPN42AIgcztHnSaeYEUHhgy0P3sil04wpCLe2F1m_HK7eZDg0D6bxXBzGOttA8lN4Um0cObimx2j07DNb1KHtgA465FpjPDMqWfHdw_Uvy4vKwfqIzwSp2nO3iqK1VlRDQlcNCHFi33PfESso3Up1yHvkhiqBlVSVl-GSm7a49vnTMX-xGIhPWqDBCSEunEF8nPTZ1Y4zM0RCnQojWjaCiesNq_3iy3PgL-LKEEmxTNe6RixwK3d2Swd8dWjcy743XEdxa4xUeiD9wtrT8Zx0hyWIYejSDm7W0REBeViHrYNw8Pr927I5Vrlm6rN23psIff2--yF; sess-at-main="goK1tVYcgb7R2XcYasQ8/+2ABnZO2hdIYIDDrbHkOiM="; x-wl-uid=1XXT7ohIeJZyY/7sC/ytnTHT+Vwn62xt/Jqr4l2xe+WetZvII2HlOUcCgEXfKRBNwgbOTZrrdaMcZJNwjXVH48GrQ2/ROuA6CLbVJ1xZ9Jo0PgAxPQ9L1NbmGgdG1oBrv+QyImd7kzKk=; skin=noskin; JSESSIONID=5AF6FA51D68211D42C8B1E4F272968D3; s_sess=%20s_ppvl%3D%3B%20s_ppv%3DUS%25253AAZ%25253ASOA-overview-seeall%252C92%252C37%252C3448%252C2880%252C1348%252C1920%252C1080%252C0.67%252CL%3B%20s_cc%3Dtrue%3B%20c_m%3DundefinedTyped/BookmarkedTyped/Bookmarked%3B%20s_sq%3Dacsus-prod%253D%252526pid%25253D508510%252526pidt%25253D1%252526oid%25253Dhttps%2525253A%2525252F%2525252Fwww.amazon.com%2525252Fgp%2525252Fhelp%2525252Fcustomer%2525252Fforums%2525252Fkindleqna%2525252Fref%2525253Dhp_gt_comp_ss_forum_Kindle%252526ot%25253DA%3B; 
s_pers=%20s_fid%3D0B584A252E26D24B-0508DABA67938BE2%7C1662514482807%3B%20s_dl%3D1%7C1507518928552%3B%20gpv_page%3DUS%253ASC%253A%2520SellerCentralLogin%7C1507518928555%3B%20s_ev15%3D%255B%255B%2527Typed/Bookmarked%2527%252C%25271507517128514%2527%255D%252C%255B%2527Typed/Bookmarked%2527%252C%25271507517128557%2527%255D%255D%7C1665283528557%3B; x-main="Ihat36AUznBqU@bAXSLMRPKxSgCxnH1bEUQiqvtsIVRnH75aplq29jnQqj?LHESj"; lc-main=en_US; session-token="STQf3qhsr7tRS9RXdV+B5iLxVuHXTWxw3sz5jRhYxXQ8OePnWMVSkTiU5Wot7+yBX2/sYgnK2b585EUmLISbef7iNSg9mAtKSxMrAgH3WlNomIsvZw5cciATBtgld6xxTLN0L4qRqPXGW6ucpWuX9M0yI4EWddIswJKbCJK56B27pyn72QR0wPvzVVODwGZRxTxjBywnAJfXhbq/zyaIytgH0BvgASM5FAp6FyMVO7E/dq31m2t0EQY1Jx+jmFu6/mZy03RzzDPWKOZKlpVMSg=="; ubid-main=131-0601013-2724535; session-id-time=2082787201l; session-id=136-1823294-1208816; csm-hit=TC76C8PW39N5MY6AKGWM+s-FTFB6ESPHJ74V4N2XPGW|1507777709001'
}

# HTTP request headers for www.fanzle.com (see the "Host" entry): a JSON-accepting
# XMLHttpRequest sent with a desktop Chrome user agent.
csheaders = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Host": "www.fanzle.com",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "X-Requested-With": "XMLHttpRequest"
}

# Module-level loggers.
# NOTE(review): keylog is assigned twice; the LogHandler('key') below replaces
# the LogHandler('keylog') created first - confirm which name is intended.
keylog = LogHandler('keylog')
toplog = LogHandler('toplog')
keylog = LogHandler('key')
# MongoDB client on localhost:27017 and a handle to its 'keySearch' database.
conn = MongoClient(host='127.0.0.1', port=27017)
amxpage = conn['keySearch']


class getPage(object):
    """Holds one page URL and builds the request settings for it."""

    def __init__(self, url):
        """Remember the URL to request."""
        super(getPage, self).__init__()
        self.url = url

    def generateData(self):
        """Return the settings dict for ``self.url``.

        The dict carries the module-level ``headers`` mapping, a ``timeout``
        of 3 and ``ac`` set to None.
        """
        settings = dict(url=self.url, headers=headers)
        settings['timeout'] = 3
        settings['ac'] = None
        return settings
示例#16
0
	def __init__(self):
		"""Load the per-site configuration, create the log and database handlers, then call run().

		The command line decides how run() is called: one argument is passed
		as the category, two arguments as category and source, and any other
		argument count calls run() without arguments.
		"""
		#=======================================================================
		# Configuration values
		#=======================================================================
		self.wwwPath = '/var/www/html/'
		# Directory for images
		self.linkImagePath = self.wwwPath + 'imageslink/'
		# User agent used for URL requests
		self.userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
		# Show the links that are fetched and processed
		self.showLink = False
		# Suffix for the links database table
		self.yearMonth = ""
		# Turkish characters and their ASCII replacements
		self.turkishDict = {'ı': 'i', 'ü': 'u', 'ö':'o', 'ş':'s', 'ç':'c', 'ğ':'g'}

		#=======================================================================
		# Configuration For Requests
		#=======================================================================
		# For some sites the news image is located by its class.
		# Several classes may be given; write them together in the value.
		# (The 'http://haber.sol.org.tr' entry used to be listed twice with the
		# same value; the duplicate key is removed.)
		self.imageClassForSources = {
			'http://www.aksam.com.tr':'image',
			'http://www.trthaber.com':'image',
			'http://www.trthaber.com/haber/kultur-sanat/':'image',
			'http://amkspor.sozcu.com.tr':'in_image',
			'http://www.sosyalmedya.co':'attachment-large wp-post-image',
			'http://www.teknolojioku.com':'newsImage',
			'http://www.webrazzi.com': 'post-content',
			'http://www.taraf.com.tr/yazarlar/':'info',
			'http://www.donanimhaber.com':'entry',
			'http://www.bugun.com.tr':'image',
			'http://www.milliyet.com.tr/Yazar.aspx?aType=Yazarlar':'image',
			'http://www.gazetevatan.com/yazarlar/':'aimg',
			'http://www.cumhuriyet.com.tr/yazarlar':'author',
			'http://www.sabah.com.tr/Yazarlar':'iBox',
			'http://www.ensonhaber.com':'mansetresmi',
			'http://www.yenisafak.com.tr/yazarlar/':'picture',
			'http://www.haberler.com':'image',
			'http://www.internethaber.com':'item img active',
			'http://haber.sol.org.tr':'singlenews-image',
			'http://www.yeniakit.com.tr/yazarlar':'au-top-right',
			'http://www.takvim.com.tr':'haberImg',
			'http://www.haberturk.com':'image',
			'http://www.haber7.com':'image_src',
			'http://www.mynet.com':'twitter:image',
		}
		# Sources whose description lives in a different meta tag
		self.descMetaTypes = {
			'http://www.radikal.com.tr/yazarlar':{"name":"twitter:description"},
			'http://www.radikal.com.tr/kultur':{"name":"twitter:description"},
			'http://webtv.radikal.com.tr':{"name":"twitter:description"},
			'http://www.radikal.com.tr':{"name":"twitter:description"},
			'http://www.mynet.com/teknoloji':{"property":"og:description"},
			'http://www.mynet.com':{"property":"og:description"},
			'http://webtv.hurriyet.com.tr':{"property":"og:description"},
			'http://www.haber7.com':{"name":"twitter:description"},
			'http://www.samanyoluhaber.com':{"property":"og:description"},
		}

		# Do not take the description; assign the title instead
		self.notGetDesc = []

		# For the ? character in links
		self.containQuestionCharacter = [
			'http://www.posta.com.tr',
			'http://www.odatv.com',
		]

		# The trailing / character of the link is not removed for these
		self.notDeletedBackslashCharacter = [
			'http://www.posta.com.tr',
			'http://www.odatv.com',
			'http://www.taraf.com.tr',
			'http://www.taraf.com.tr/yazarlar/',
			'http://www.samanyoluhaber.com',
			'http://amkspor.sozcu.com.tr',
			'http://www.haberler.com',
			'http://www.diken.com.tr',
			'http://www.fizikist.com',
		]
		# If the image taken from the news item is too small or wrong, an image
		# is fetched from Google based on the title.
		# Google image has limits; fetching continuously got us banned.
		self.getGoogleImageList = []

		# Text that must be replaced inside a link's image link after it is fetched
		self.replaceStringForLink = {
			'shiftdelete.net': ('/shiftdelete.net', '/s01.shiftdelete.net/img/general_b'),
		}

		# A link containing any keyword listed here is not taken
		self.notGetLinkIfContainThisKeyword = ['javascript:', ]

		# Sites with hotlink protection: their images are downloaded instead of linked
		self.hotlinks = ['ensonhaber.com', 'bugun.com.tr', 'internethaber.com', 'haberler.com', 'odatv.com', 'cumhuriyet.com.tr', 'zaman.com.tr', 'donanimhaber.com', 'ajansspor.com', 'haber.sol.org.tr']

		# Request type to use when characters arrive garbled
		self.requestTypes = ['amkspor.sozcu.com.tr', 'yenisafak.com.tr', 'indir.com']

		# If the page source arrives encoded, a different request is made
		self.encodePageSource = ['mynet.com', 'trthaber.com']

		# Value 0: www is added, 1: www is removed, 2: http://www is removed
		self.getTweetCountFix = {'yenisafak.com.tr':0, 'bigumigu.com':1, 'webrazzi.com':2, 'odatv.com':2}

		# For cleanup when title/description characters arrive garbled
		self.contentTitleDescReplace = ['zaman.com.tr', 'shiftdelete.net']

		# If the links contain a comma, use the other way to get the facebook share count
		self.linkContentComma = ['t24.com.tr', 'ntv.com.tr']

		# Links you do not want to see are skipped (only those found in the image link)
		self.blackListLinkImage = {}

		#=======================================================================
		# Class import
		self.logHandler = LogHandler("Main")

		# Database connection
		self.serverHandler = ServerDatabaseHandler()

		# Distinction between local and server operation
		self.getLinkCountLimit = 1500

		if len(sys.argv) == 2:
			category = sys.argv[1]
			self.run(category)
		elif len(sys.argv) == 3:
			category = sys.argv[1]
			source = sys.argv[2]
			self.run(category, source)
		else:
			self.run()
示例#17
0
class Main:
	"""Holds the per-site configuration and the image helper methods.

	The constructor ends by calling run(), which is not part of this
	section of the file.
	"""

	def __init__(self):
		"""Load the per-site configuration, create the log and database handlers, then call run().

		The command line decides how run() is called: one argument is passed
		as the category, two arguments as category and source, and any other
		argument count calls run() without arguments.
		"""
		#=======================================================================
		# Configuration values
		#=======================================================================
		self.wwwPath = '/var/www/html/'
		# Directory for images
		self.linkImagePath = self.wwwPath + 'imageslink/'
		# User agent used for URL requests
		self.userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"
		# Show the links that are fetched and processed
		self.showLink = False
		# Suffix for the links database table
		self.yearMonth = ""
		# Turkish characters and their ASCII replacements
		self.turkishDict = {'ı': 'i', 'ü': 'u', 'ö':'o', 'ş':'s', 'ç':'c', 'ğ':'g'}

		#=======================================================================
		# Configuration For Requests
		#=======================================================================
		# For some sites the news image is located by its class.
		# Several classes may be given; write them together in the value.
		# (The 'http://haber.sol.org.tr' entry used to be listed twice with the
		# same value; the duplicate key is removed.)
		self.imageClassForSources = {
			'http://www.aksam.com.tr':'image',
			'http://www.trthaber.com':'image',
			'http://www.trthaber.com/haber/kultur-sanat/':'image',
			'http://amkspor.sozcu.com.tr':'in_image',
			'http://www.sosyalmedya.co':'attachment-large wp-post-image',
			'http://www.teknolojioku.com':'newsImage',
			'http://www.webrazzi.com': 'post-content',
			'http://www.taraf.com.tr/yazarlar/':'info',
			'http://www.donanimhaber.com':'entry',
			'http://www.bugun.com.tr':'image',
			'http://www.milliyet.com.tr/Yazar.aspx?aType=Yazarlar':'image',
			'http://www.gazetevatan.com/yazarlar/':'aimg',
			'http://www.cumhuriyet.com.tr/yazarlar':'author',
			'http://www.sabah.com.tr/Yazarlar':'iBox',
			'http://www.ensonhaber.com':'mansetresmi',
			'http://www.yenisafak.com.tr/yazarlar/':'picture',
			'http://www.haberler.com':'image',
			'http://www.internethaber.com':'item img active',
			'http://haber.sol.org.tr':'singlenews-image',
			'http://www.yeniakit.com.tr/yazarlar':'au-top-right',
			'http://www.takvim.com.tr':'haberImg',
			'http://www.haberturk.com':'image',
			'http://www.haber7.com':'image_src',
			'http://www.mynet.com':'twitter:image',
		}
		# Sources whose description lives in a different meta tag
		self.descMetaTypes = {
			'http://www.radikal.com.tr/yazarlar':{"name":"twitter:description"},
			'http://www.radikal.com.tr/kultur':{"name":"twitter:description"},
			'http://webtv.radikal.com.tr':{"name":"twitter:description"},
			'http://www.radikal.com.tr':{"name":"twitter:description"},
			'http://www.mynet.com/teknoloji':{"property":"og:description"},
			'http://www.mynet.com':{"property":"og:description"},
			'http://webtv.hurriyet.com.tr':{"property":"og:description"},
			'http://www.haber7.com':{"name":"twitter:description"},
			'http://www.samanyoluhaber.com':{"property":"og:description"},
		}

		# Do not take the description; assign the title instead
		self.notGetDesc = []

		# For the ? character in links
		self.containQuestionCharacter = [
			'http://www.posta.com.tr',
			'http://www.odatv.com',
		]

		# The trailing / character of the link is not removed for these
		self.notDeletedBackslashCharacter = [
			'http://www.posta.com.tr',
			'http://www.odatv.com',
			'http://www.taraf.com.tr',
			'http://www.taraf.com.tr/yazarlar/',
			'http://www.samanyoluhaber.com',
			'http://amkspor.sozcu.com.tr',
			'http://www.haberler.com',
			'http://www.diken.com.tr',
			'http://www.fizikist.com',
		]
		# If the image taken from the news item is too small or wrong, an image
		# is fetched from Google based on the title.
		# Google image has limits; fetching continuously got us banned.
		self.getGoogleImageList = []

		# Text that must be replaced inside a link's image link after it is fetched
		self.replaceStringForLink = {
			'shiftdelete.net': ('/shiftdelete.net', '/s01.shiftdelete.net/img/general_b'),
		}

		# A link containing any keyword listed here is not taken
		self.notGetLinkIfContainThisKeyword = ['javascript:', ]

		# Sites with hotlink protection: their images are downloaded instead of linked
		self.hotlinks = ['ensonhaber.com', 'bugun.com.tr', 'internethaber.com', 'haberler.com', 'odatv.com', 'cumhuriyet.com.tr', 'zaman.com.tr', 'donanimhaber.com', 'ajansspor.com', 'haber.sol.org.tr']

		# Request type to use when characters arrive garbled
		self.requestTypes = ['amkspor.sozcu.com.tr', 'yenisafak.com.tr', 'indir.com']

		# If the page source arrives encoded, a different request is made
		self.encodePageSource = ['mynet.com', 'trthaber.com']

		# Value 0: www is added, 1: www is removed, 2: http://www is removed
		self.getTweetCountFix = {'yenisafak.com.tr':0, 'bigumigu.com':1, 'webrazzi.com':2, 'odatv.com':2}

		# For cleanup when title/description characters arrive garbled
		self.contentTitleDescReplace = ['zaman.com.tr', 'shiftdelete.net']

		# If the links contain a comma, use the other way to get the facebook share count
		self.linkContentComma = ['t24.com.tr', 'ntv.com.tr']

		# Links you do not want to see are skipped (only those found in the image link)
		self.blackListLinkImage = {}

		#=======================================================================
		# Class import
		self.logHandler = LogHandler("Main")

		# Database connection
		self.serverHandler = ServerDatabaseHandler()

		# Distinction between local and server operation
		self.getLinkCountLimit = 1500

		if len(sys.argv) == 2:
			category = sys.argv[1]
			self.run(category)
		elif len(sys.argv) == 3:
			category = sys.argv[1]
			source = sys.argv[2]
			self.run(category, source)
		else:
			self.run()

	def getGoogleImage(self, searchTerm):
		"""Look up an image for ``searchTerm`` through the Google AJAX image search URL.

		Turkish characters are replaced via ``turkishDict`` and only the first
		four words of the term are sent.

		:param searchTerm: text to search for
		:return: the ``unescapedUrl`` of the first result that is at least
		         300x300, or "" when there is none or the request fails
		"""
		searchTerm = self.multipleReplace(searchTerm, self.turkishDict)
		searchTerm = '%20'.join(searchTerm.split()[:4])

		imageUrl = ""
		try:
			url = 'https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0&q='+searchTerm
			request = urllib2.Request(url, None, {'User-Agent' : self.userAgent})
			response = urllib2.urlopen(request, timeout=10)
			# Get results using JSON
			results = simplejson.load(response)

			# Stop at the first result that is large enough and has a url
			for myUrl in results['responseData']['results']:
				if int(myUrl['width']) >= 300 and int(myUrl['height']) >= 300:
					imageUrl = myUrl['unescapedUrl']
				else:
					imageUrl = ""
				if imageUrl:
					break
		except Exception:
			# Any failure (network, unexpected JSON shape) means "no image"
			imageUrl = ""

		return imageUrl

	def multipleReplace(self, text, wordDict):
		"""Return ``text`` with every key of ``wordDict`` replaced by its value."""
		for key in wordDict:
			text = text.replace(key, wordDict[key])
		return text

	def _requestRedirectTarget(self, homeUrl, maxLength):
		"""Fetch ``homeUrl``; if the reply is shorter than ``maxLength``, also fetch the 'url=' target found in it.

		Errors are passed to ``logHandler.logger`` and not raised.
		"""
		try:
			req = urllib2.Request(homeUrl, headers={'User-Agent' : self.userAgent})
			html = urllib2.urlopen(req, timeout=5).read()
			if len(html) < maxLength:
				firstIndex = html.find('url=')
				endIndex = html[firstIndex:].find('"') + firstIndex

				url = html[firstIndex + len('url='): endIndex]
				if url and url.find('http') != -1:
					req = urllib2.Request(url, headers={'User-Agent' : self.userAgent})
					urllib2.urlopen(req, timeout=5).read()
					print(url)
		except Exception:
			self.logHandler.logger("run")

	def downloadImage(self, source, link, path):
		"""Download the image at ``link`` into ``path`` unless the file is already there.

		:param source: site name; 'cumhuriyet.com.tr' and 'odatv.com' get an
		               extra request first, and 'cumhuriyet.com.tr' builds the
		               file name from the last two path segments of the link
		:param link: image URL
		:param path: target directory prefix, joined to the file name by plain
		             concatenation (so it must end with a separator)
		:return: the file name on success, 0 on any error
		"""
		# For these sources a request is sent to the returned link first,
		# to show that we are not a bot
		if source == 'cumhuriyet.com.tr':
			self._requestRedirectTarget('http://www.cumhuriyet.com.tr', 250)
		elif source == 'odatv.com':
			self._requestRedirectTarget('http://www.odatv.com', 300)

		try:
			if source == 'cumhuriyet.com.tr':
				imageId = link[link[:link.rfind('/')].rfind('/') + 1:].replace('/', '')
			else:
				imageId = link[link.rfind('/') + 1:]
			# Cut off a query string or, failing that, a fragment
			if imageId.find('?') != -1:
				imageId = imageId[:imageId.find('?')]
			elif imageId.find('#') != -1:
				imageId = imageId[:imageId.find('#')]

			if not os.path.exists(path + imageId):
				# The request object is built only when a download is needed
				req = urllib2.Request(link, headers={'User-Agent' : self.userAgent})
				htmlSource = urllib2.urlopen(req, timeout=5).read()
				with open(path + imageId, "wb") as f:
					f.write(htmlSource)
			return imageId
		except Exception as error:
			print("%s %s" % (error, link))
			return 0
示例#18
0
    def __init__(self):
        """
        Initialise the environment object: default flags, log formats,
        installers, platform flags, configuration and state.

        The config directory comes from ``self._cfgdir_get()`` and the base
        directory from ``self._basedir_get()``. (The previous docstring
        described a ``configdir`` parameter defaulting to /sandbox/cfg, then
        ~/sandbox/cfg; this method takes no such parameter.)

        When an existing config file has no ``DIR_BASE`` key the method returns
        early and ``self.__init`` stays False.

        :raises: ``self.tools.exceptions.Base`` when the platform is not
                 linux, darwin or win32.
        :return:
        """
        # NOTE(review): the helper objects created in this method receive
        # ``self``; they may read attributes set earlier, so keep the
        # statement order unless their constructors have been checked.
        self.tools = Tools(self)
        self.DEFAULT_BRANCH = DEFAULT_BRANCH
        self.readonly = False  # if readonly will not manipulate local filesystem appart from /tmp
        self.sandbox_python_active = False  # means we have a sandboxed environment where python3 works in
        self.sandbox_lua_active = False  # same for lua
        self.config_changed = False
        self._cmd_installed = {}
        # should be the only location where we allow logs to be going elsewhere
        self.loghandlers = []
        self.errorhandlers = []
        self.state = None
        # Set to True only by the last statement of this method.
        self.__init = False
        self.debug = False
        self.log_console = False
        # Provisional value; replaced from the LOGGER_LEVEL config key below
        # when initialisation gets that far.
        self.log_level = 15
        self._secret = None

        self.interactive = False

        self.appname = "installer"

        self.FORMAT_TIME = "%a %d %H:%M:%S"

        # ANSI escape sequences keyed by the placeholder names used in the
        # log format templates below.
        self.MYCOLORS = {
            "RED": "\033[1;31m",
            "BLUE": "\033[1;34m",
            "CYAN": "\033[1;36m",
            "GREEN": "\033[0;32m",
            "GRAY": "\033[0;37m",
            "YELLOW": "\033[0;33m",
            "RESET": "\033[0;0m",
            "BOLD": "\033[;1m",
            "REVERSE": "\033[;7m",
        }

        # Same keys as MYCOLORS, all mapped to the empty string
        # (presumably used to render the templates without colours).
        self.MYCOLORS_IGNORE = {
            "RED": "",
            "BLUE": "",
            "CYAN": "",
            "GREEN": "",
            "GRAY": "",
            "YELLOW": "",
            "RESET": "",
            "BOLD": "",
            "REVERSE": "",
        }

        # Shared template; "{COLOR}" is substituted per level below.
        LOGFORMATBASE = "{COLOR}{TIME} {filename:<20}{RESET} -{linenr:4d} - {GRAY}{context:<35}{RESET}: {message}"  # DO NOT CHANGE COLOR

        # Log line template per level name.
        self.LOGFORMAT = {
            "DEBUG":
            LOGFORMATBASE.replace("{COLOR}", "{CYAN}"),
            "STDOUT":
            "{message}",
            # 'INFO': '{BLUE}* {message}{RESET}',
            "INFO":
            LOGFORMATBASE.replace("{COLOR}", "{BLUE}"),
            "WARNING":
            LOGFORMATBASE.replace("{COLOR}", "{YELLOW}"),
            "ERROR":
            LOGFORMATBASE.replace("{COLOR}", "{RED}"),
            "CRITICAL":
            "{RED}{TIME} {filename:<20} -{linenr:4d} - {GRAY}{context:<35}{RESET}: {message}",
        }

        self.GITREPOS = GITREPOS
        self._db = None

        self.installers = Installers()
        self.installers.osx = OSXInstaller(self)
        self.installers.ubuntu = UbuntuInstaller(self)
        self.installers.base = BaseInstaller(self)
        self.installers.jumpscale = JumpscaleInstaller(self)

        self.docker = DockerFactory(self)
        self.redis = RedisTools(self)

        # Derive the platform flags from self.platform().
        # NOTE(review): platform_is_windows is assigned only in the win32
        # branch; on linux/darwin this method leaves it unset - confirm it is
        # defined elsewhere before reading it.
        if self.platform() == "linux":
            self.platform_is_linux = True
            self.platform_is_unix = True
            self.platform_is_osx = False
        elif "darwin" in self.platform():
            self.platform_is_linux = False
            self.platform_is_unix = True
            self.platform_is_osx = True
        elif "win32" in self.platform():
            self.platform_is_linux = False
            self.platform_is_unix = False
            self.platform_is_osx = False
            self.platform_is_windows = True
        else:
            raise self.tools.exceptions.Base(
                "platform not supported, only linux or osx and windows for now."
            )

        configdir = self._cfgdir_get()
        basedir = self._basedir_get()

        # First-time bootstrap: when the base dir is /sandbox and does not
        # exist yet, create it (and /usr/local/EGG-INFO) via sudo and hand
        # ownership to the current user and group.
        if basedir == "/sandbox" and not os.path.exists(basedir):
            script = """
            set -e
            cd /
            sudo mkdir -p /sandbox/cfg
            sudo chown -R {USERNAME}:{GROUPNAME} /sandbox
            mkdir -p /usr/local/EGG-INFO
            sudo chown -R {USERNAME}:{GROUPNAME} /usr/local/EGG-INFO
            """
            args = {}
            args["USERNAME"] = getpass.getuser()
            # NOTE(review): self.config is read here although this method only
            # assigns it further down - presumably it is provided elsewhere
            # (class attribute or property); confirm.
            st = os.stat(self.config["DIR_HOME"])
            gid = st.st_gid
            # import is here cause it's only unix
            # for windows support
            import grp
            args["GROUPNAME"] = grp.getgrgid(gid)[0]
            self.tools.execute(script,
                               interactive=True,
                               args=args,
                               die_if_args_left=True)

        # Set codedir
        self.tools.dir_ensure(f"{basedir}/code")
        self.config_file_path = os.path.join(configdir,
                                             "jumpscale_config.toml")
        self.state_file_path = os.path.join(configdir, "jumpscale_done.toml")

        # Load the existing config file, or fall back to the defaults.
        if self.tools.exists(self.config_file_path):
            self._config_load()
            if not "DIR_BASE" in self.config:
                # Config without DIR_BASE: stop here, self.__init stays False.
                return
        else:
            self.config = self.config_default_get()

        # Keep only entries that are non-empty after stripping whitespace and
        # single quotes.
        self.log_includes = [
            i for i in self.config.get("LOGGER_INCLUDE", [])
            if i.strip().strip("''") != ""
        ]
        self.log_excludes = [
            i for i in self.config.get("LOGGER_EXCLUDE", [])
            if i.strip().strip("''") != ""
        ]
        self.log_level = self.config.get("LOGGER_LEVEL", 10)
        # self.log_console = self.config.get("LOGGER_CONSOLE", False)
        # self.log_redis = self.config.get("LOGGER_REDIS", True)
        self.debug = self.config.get("DEBUG", False)
        # The JSXDEBUG environment variable forces debug mode regardless of
        # the config value.
        if "JSXDEBUG" in os.environ:
            self.debug = True
        self.debugger = self.config.get("DEBUGGER", "pudb")

        # The sandboxed python counts as active when <DIR_BASE>/bin/python3.6
        # exists (the interpreter version is hard-coded).
        if os.path.exists(
                os.path.join(self.config["DIR_BASE"], "bin", "python3.6")):
            self.sandbox_python_active = True
        else:
            self.sandbox_python_active = False

        self._state_load()

        self.sshagent = SSHAgent(myenv=self)

        # Route uncaught exceptions of the whole process through self.excepthook.
        sys.excepthook = self.excepthook
        if self.tools.exists("{}/bin".format(
                self.config["DIR_BASE"])):  # To check that Js is on host
            self.loghandler_redis = LogHandler(self, db=self.db)
        else:
            # print("- redis loghandler cannot be loaded")
            self.loghandler_redis = None

        self.__init = True
示例#19
0
        $ export AWS_SECRET_ACCESS_KEY=<Your AWS Secret Access Key>

"""

import sys
import boto
from LogHandler import LogHandler

####################################################################
#
# GLOBALS
#
####################################################################

# Module-wide log handler created with the log file name 's3-image-load.log'.
# The original note says the file is placed in /tmp/; that location is decided
# by LogHandler, not in this module. Functions obtain their own named logger
# from it through logger.get_logger(<name>).
logger = LogHandler(log_filename='s3-image-load.log')

####################################################################
#
# FUNCTIONS
#
####################################################################


def connect():
    """
        Connect to S3 with credentials
    """
    log = logger.get_logger("connect")

    try: