예제 #1
0
    def __init__(self, logger=None):
        """Bind the logger and the shared strategy/spider map, then call
        ``globalize_strategystatus()``.

        :param logger: logger to reuse; a falsy value falls back to
            ``get_logger('Monitor', True)``.
        """
        if logger:
            # reuse the logger handed in (e.g. by the engine)
            self.logger = logger
        else:
            self.logger = get_logger('Monitor', True)

        # reference to the global map; will be used in spider plugins
        self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict

        self.globalize_strategystatus()
예제 #2
0
    def __init__(self):
        """Create the 'Core' logger and build the four core components with it.

        The components are constructed in the order Scheduler, Downloader,
        Uploader, Monitor, each receiving the same logger instance.
        """
        # core modules share the same logger
        shared_logger = get_logger('Core', True)
        self.logger = shared_logger

        self.Scheduler = Scheduler(shared_logger)
        self.Downloader = Downloader(shared_logger)
        self.Uploader = Uploader(shared_logger)
        self.Monitor = Monitor(shared_logger)
예제 #3
0
    def __init__(self, logger=None):
        """Bind the logger and the shared global containers, then call
        ``globalize_spiderthreads()``.

        :param logger: logger to reuse; a falsy value falls back to
            ``get_logger('Downloader', True)``.
        """
        self.logger = logger if logger else get_logger('Downloader', True)

        # taken from GlobalConfig
        self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict
        # taken from GlobalQueues
        self.StrategyTaskQueue_dict = GlobalQueues.StrategyTaskQueue_dict
        self.UploadResults_queue = GlobalQueues.UploadResults_queue
        # taken from GlobalDicts
        self.StrategyGroup_dict = GlobalDicts.StrategyGroup_dict

        # original note: "GlobalDicts" (presumably the target of this call -- confirm)
        self.globalize_spiderthreads()
예제 #4
0
    def __init__(self, logger=None):
        """Bind the logger and shared global containers, then run the three
        setup calls in order: ``globalize_queues()``,
        ``globalize_strategygroup()``, ``load_strategies()``.

        :param logger: logger to reuse; a falsy value falls back to
            ``get_logger('Scheduler', True)``.
        """
        if not logger:
            # nothing usable was passed in, so obtain a dedicated logger
            logger = get_logger('Scheduler', True)
        self.logger = logger

        self.StrategySpiderMap_dict = GlobalConfig.StrategySpiderMap_dict
        self.SpiderThreads_dict = GlobalDicts.SpiderThreads_dict

        # setup steps, kept in their original order
        self.globalize_queues()
        self.globalize_strategygroup()
        self.load_strategies()
예제 #5
0
# -*- coding:utf-8 -*-
'''
Created on 2017年6月19日

@author: Thinkpad
'''
import json
import copy
import global_vars.global_config as GlobalConfig
from log.log import get_logger
# Module-level logger, obtained once at import time under the name 'strategy'.
logger = get_logger('strategy')


class Strategy():
    def __init__(self,
                 StrategyID='TEST',
                 Timeout=0,
                 WaitTime=1,
                 RetryTime=3,
                 AdditionParams={},
                 Encoding='utf-8',
                 FragmentalUpload=False,
                 FragmentalAmount=5,
                 ContentException=[],
                 CookieUse=False):
        self.StrategyID = StrategyID
        self.Encoding = Encoding
        self.Timeout = Timeout
        self.WaitTime = WaitTime
        self.RetryTime = RetryTime
        self.AdditionParams = AdditionParams
예제 #6
0
import requests
from lxml import html
import log.log as log
import random
import traceback
import re
import os

from concurrent.futures import ThreadPoolExecutor as TPE
from concurrent.futures import as_completed

# Module-level logger, obtained under this module's own name.
proxyLogger = log.get_logger(__name__)

# Module-level counters, both starting at zero.
# NOTE(review): presumably tallies of HTTPS / HTTP proxies -- the code that
# updates them is outside this view; confirm.
httpsCount = 0
httpCount = 0


def proxyScraper():
    """
    Parses htmlobject using xpath search pulling IP, port, Country, type, HTTPS and Time discovered
    information for a single request
    :param htmlobject:
    :return: proxyDict
    """
    uri = 'https://free-proxy-list.net/'

    pageContent = requests.get(url=uri, headers=rand_useragent(), timeout=10)

    tree = html.fromstring(pageContent.content)

    proxyIP = [item for item in tree.xpath('//table/tbody/tr/td[1]/text()')]
예제 #7
0
#!/usr/bin/env python3.7

from datetime import datetime as dt

from sqlalchemy.orm import sessionmaker
import log.log as log

from database.db_base import Base, engine, URLs, RapplerURLs

# Module-level logger, obtained under this module's own name.
dbLogger = log.get_logger(__name__)

# strftime/strptime-style date pattern (year-month-day); its users are
# outside this view.
dateFormat = "%Y-%m-%d"

# Import-time side effects: issue CREATE TABLE for the tables registered on
# Base.metadata (checkfirst=True skips tables that already exist), then build
# a session factory bound to the engine and open one module-level session
# that the query helpers below use.
Base.metadata.create_all(engine, checkfirst=True)
DBSession = sessionmaker(bind=engine)
session = DBSession()


def recentRecords(TableName):
    """
    search db table for dynamic column, return max 1000
    'recentResult' is a list
    if no table is present from db, return an empty list
    ###SQL QUERY:
    SELECT composite FROM table
    ORDER BY id DESC
    """

    records = []

    dbRecords = session.query(TableName).order_by(
예제 #8
0
# -*- coding:utf-8 -*-
'''
Created on 2017年6月19日

@author: Thinkpad
'''
import json
import copy
import global_vars.global_dicts as GlobalDicts
import global_vars.global_config as GlobalConfig

from log.log import get_logger
# Module-level logger, obtained once at import time under the name 'Task'.
logger = get_logger('Task')


class Task():
    def __init__(self,
                 TaskID=0,
                 StrategyID='TEST',
                 TaskType=0,
                 TaskContent='',
                 TaskStatus=0,
                 AdditionParams={},
                 Encoding=None):
        # TaskType = [0, 1, 2]
        # 0: plain    return {TaskIns_received : {FileName : FileContent, ...}, TaskIns_generated_1 : {}, ...}
        # 1: deep     ..
        # 2: reservation

        # TaskStatus = [0, 1, 2]
        # 0: fail
예제 #9
0
def upload_results_fragment(logger, *args, **kws):  # TODO: add a sign in additional params of tasks?
    """Placeholder for fragmental result upload; not implemented yet.

    Only resolves which logger would be used and returns ``None``.

    :param logger: logger to use; a falsy value falls back to
        ``get_logger('server_interact')``.
    :param args: accepted and currently ignored.
    :param kws: accepted and currently ignored.
    """
    if logger:
        method_logger = logger
    else:
        method_logger = get_logger('server_interact')
    return None
예제 #10
0
def get_proxy(logger, *args, **kw):
    """Placeholder for fetching a proxy; not implemented yet.

    Only resolves which logger would be used and returns ``None``.

    :param logger: logger to use; a falsy value falls back to
        ``get_logger('server_interact')``.
    :param args: accepted and currently ignored.
    :param kw: accepted and currently ignored.
    """
    if logger:
        method_logger = logger
    else:
        method_logger = get_logger('server_interact')
    return None
예제 #11
0
def get_captcha(logger, *args, **kws):
    """Placeholder for fetching a captcha; not implemented yet.

    Only resolves which logger would be used and returns ``None``.

    :param logger: logger to use; a falsy value falls back to
        ``get_logger('server_interact')``.
    :param args: accepted and currently ignored.
    :param kws: accepted and currently ignored.
    """
    if logger:
        method_logger = logger
    else:
        method_logger = get_logger('server_interact')
    return None
예제 #12
0
def get_captcha(logger, *args, **kws):
    """Stub: captcha retrieval is not implemented yet.

    Picks the logger that an implementation would use and returns ``None``.

    :param logger: logger to use; when falsy, ``get_logger('server_interact')``
        is called instead.
    :param args: accepted and currently ignored.
    :param kws: accepted and currently ignored.
    """
    method_logger = logger if logger else get_logger('server_interact')
    return None

def get_proxy(logger, *args, **kw):
    """Stub: proxy retrieval is not implemented yet.

    Picks the logger that an implementation would use and returns ``None``.

    :param logger: logger to use; when falsy, ``get_logger('server_interact')``
        is called instead.
    :param args: accepted and currently ignored.
    :param kw: accepted and currently ignored.
    """
    method_logger = logger if logger else get_logger('server_interact')
    return None






if __name__ == '__main__':
    # Manual driver: runs only when this file is executed directly.
    # One logger is created here and handed to every helper call below.
    methods_shared_logger = get_logger('server_interact',True)
  
    get_strategy(methods_shared_logger)
    get_task('TEST_1', 10,methods_shared_logger)
    
    # UploadPack = {0 : ("TaskStatus", "zipfile_md5_task0"), 1 : ("TaskStatus", "zipfile_md5_task1")}
    # The payload string below is the base64 encoding of "zipfile_md5_task0".
    UploadPack = {0 : ("TaskStatus", "emlwZmlsZV9tZDVfdGFzazA=")}
    ComfirmInfo = upload_results(UploadPack, methods_shared_logger)


 
    # Imported locally, so the dependency is only needed for this manual run.
    from data_structure.status import MachineStatus, StrategyStatus
    MachineStatusIns = MachineStatus()
    # NOTE(review): StrategyStatusIns is rebound on the second line; the first
    # instance stays reachable only if StrategyStatus registers itself with
    # MachineStatusIns -- confirm.
    StrategyStatusIns = StrategyStatus(MachineStatusIns, 'TEST_1')
    StrategyStatusIns = StrategyStatus(MachineStatusIns, 'TEST_2')
      
예제 #13
0
 def __init__(self, StrategyID):
     """Initialise the base spider and attach a per-strategy logger.

     ``StrategyID`` is passed in explicitly because the strategy instance
     is not loaded yet when the spider is first instantiated.

     :param StrategyID: identifier used to build the log name
         ``'Spider_Test_<StrategyID>'``.
     """
     # load default params from the parent class
     super(Spider_Test, self).__init__()
     self.logger = get_logger('Spider_Test_%s' % str(StrategyID), True)
예제 #14
0
    def __init__(self, logger=None):
        """Bind the logger and the shared upload-results queue.

        :param logger: logger to reuse; a falsy value falls back to
            ``get_logger('Uploader', True)``.
        """
        if not logger:
            # nothing usable was passed in, so obtain a dedicated logger
            logger = get_logger('Uploader', True)
        self.logger = logger

        # reference to the queue object held in GlobalQueues
        self.UploadResults_queue = GlobalQueues.UploadResults_queue
예제 #15
0
# -*- coding:utf-8 -*-
'''
Created on 2017年6月23日

@author: Thinkpad
'''
import json
import base64
import os
import time
from flask import Flask
from flask import request
from log.log import get_logger
# Module-level logger, obtained once at import time under the name 'server_service'.
logger = get_logger('server_service')

# Flask application object for this module; any route handlers registered on
# it are outside this view.
app = Flask(__name__)


def StrategyGenerator():
    # initial strategies
    TEST_1_1_json = '{"RetryTime": 2, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_1", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}'
    TEST_2_1_json = '{"RetryTime": 2, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_2", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}'
    StrategyGroup_1_dict = {'TEST_1': TEST_1_1_json, 'TEST_2': TEST_2_1_json}
    StrategyGroupJson_1 = json.dumps(StrategyGroup_1_dict)

    # new strategies
    TEST_1_2_json = '{"RetryTime": 3, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_1", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}'
    TEST_2_2_json = '{"RetryTime": 3, "WaitTime": 1, "CookieUse":false, "StrategyID": "TEST_2", "AdditionParams": "{\\"rules\\":[\\"rule_1\\", \\"rule_2\\"]}", "Timeout": 3, "Encoding": "utf-8", "FragmentalUpload": false, "FragmentalAmount":5, "ContentException":[]}'
    StrategyGroup_2_dict = {'TEST_1': TEST_1_2_json, 'TEST_2': TEST_2_2_json}
    StrategyGroupJson_2 = json.dumps(StrategyGroup_2_dict)
예제 #16
0
# -*- coding:utf-8 -*-
'''
Created on 2017年6月19日

@author: Thinkpad
'''
import json
from collections import Counter
from utils.decorators import Singleton
from log.log import get_logger
# Module-level logger, obtained once at import time under the name 'status'.
logger = get_logger('status')


@Singleton
class MachineStatus():
    # MachineStatus is instantiated once for a engine startup
    def __init__(self, user='******'):
        self._user = user  # usually IP of vps
        self._MachineStatusCollector_dict = {
        }  # {StrategyID : strategy_status_collector, ...}

    def machine_status_collector(self):
        return self._MachineStatusCollector_dict

    def get_user(self):
        return self._user

    def get_json(
        self
    ):  # when sending pass the _MachineStatusCollector_dict into upload_status directly
        MachineStatusJson = json.dumps(self._MachineStatusCollector_dict,