示例#1
0
 def __init__(self, conf):
     """Wire up logging, the Redis queue connection and the classifier.

     Args:
         conf: Configuration object. ``log_file``, ``redis_host``,
             ``redis_port`` and ``redis_db`` are read from it as
             attributes.
     """
     self.conf = conf

     # The file handler is pushed before the logger is instantiated.
     log_handler = TimedRotatingFileHandler(
         conf.log_file,
         date_format="%Y-%m-%d",
     )
     log_handler.push_application()
     self.logger = Logger("Firetower-server")

     redis_settings = {
         "host": conf.redis_host,
         "port": conf.redis_port,
         "redis_db": conf.redis_db,
     }
     self.queue = redis_util.get_redis_conn(**redis_settings)

     self.classifier = classifier.Levenshtein()
     # No archive has been recorded yet for a fresh instance.
     self.last_archive = None
示例#2
0
 def __init__(self, conf):
     """Build the settings object and open the Redis queue connection.

     Args:
         conf: Value passed to ``config.Config``; the resulting object is
             stored on ``self.conf`` and its ``redis_host``,
             ``redis_port``, ``redis_db`` and ``queue_key`` attributes are
             copied onto the instance.
     """
     settings = config.Config(conf)
     self.conf = settings

     # Mirror the connection parameters on the instance for later use.
     self.redis_host = settings.redis_host
     self.redis_port = settings.redis_port
     self.redis_db = settings.redis_db
     self.queue_key = settings.queue_key

     self.queue = redis_util.get_redis_conn(
         host=self.redis_host, port=self.redis_port, redis_db=self.redis_db
     )
 def __init__(self, sheet_name, settings=SETTINGS):
     """Create the storage clients and batching defaults for one sheet.

     Args:
         sheet_name: Name stored on ``self.sheet_name``.
         settings: Passed to ``MySqlDBClass``; defaults to the module-level
             ``SETTINGS``.
     """
     # Storage clients are created in the same order as before
     # (Mongo first, then MySQL).
     self.mongo_db = MongodbClass()
     self.mysql_conn = MySqlDBClass(settings)

     self.sheet_name = sheet_name
     self.primary_keys = []

     # Batch sizes; presumably rows per write to each backend - verify
     # against the code that consumes them.
     self.mongo_batch_size, self.mysql_batch_size = 400, 200

     self.redis_conn = get_redis_conn()
     self.def_logger = update_logging()

     # Hard-coded Windows install directory.
     self.file_path = 'C:\\Program Files (x86)\\crawling_server\\wangban_utils'
示例#4
0
 def __init__(self, conf):
     """Wire up logging, the Redis queue connection and the classifier.

     Args:
         conf: Configuration object (the original note describes it as
             YAML parameters). ``log_file``, ``redis_host``,
             ``redis_port`` and ``redis_db`` are read from it as
             attributes.
     """
     self.conf = conf

     # The file handler is pushed before the logger is instantiated.
     log_handler = TimedRotatingFileHandler(
         conf.log_file,
         date_format='%Y-%m-%d',
     )
     log_handler.push_application()
     self.logger = Logger('Firetower-admin')

     redis_settings = {
         'host': conf.redis_host,
         'port': conf.redis_port,
         'redis_db': conf.redis_db,
     }
     self.queue = redis_util.get_redis_conn(**redis_settings)

     self.classifier = classifier.Levenshtein()
示例#5
0
 def __init__(self, conf):
     """Wire up logging, the Redis queue and the configured classifiers.

     Args:
         conf: Configuration object. ``log_file``, ``redis_host``,
             ``redis_port``, ``redis_db`` and ``class_order`` are read
             from it as attributes. ``class_order`` is iterated and each
             entry is looked up by name on the ``classifier`` module.
     """
     self.conf = conf

     # The file handler is pushed before the logger is instantiated.
     log_handler = TimedRotatingFileHandler(
         conf.log_file,
         date_format='%Y-%m-%d',
     )
     log_handler.push_application()
     self.logger = Logger('Firetower-server')

     redis_settings = {
         'host': conf.redis_host,
         'port': conf.redis_port,
         'redis_db': conf.redis_db,
     }
     self.queue = redis_util.get_redis_conn(**redis_settings)

     # One instance per configured name, kept in configuration order.
     self.classifiers = [
         getattr(classifier, name)() for name in conf.class_order
     ]
     # No archive has been recorded yet for a fresh instance.
     self.last_archive = None
示例#6
0
        #print(detect_num)
        return detect_num

    def delete_from_db(self, _id, sheet_name):
        """Delete by ``_id`` from the ``sheet_name`` entry of ``self.mongo_db``.

        Best-effort: the outcome is reported with ``print`` and any
        exception is caught rather than propagated. Returns ``None``.

        Args:
            _id: Value matched against the ``_id`` field.
            sheet_name: Key used to look up the target in ``self.mongo_db``.
        """
        # NOTE(review): if this is a PyMongo collection, ``remove`` is
        # deprecated in 3.x and gone in 4.0 - confirm the driver version.
        try:
            target = self.mongo_db[sheet_name]
            id_filter = {'_id': _id}
            target.remove(id_filter)
            print('delete successfully')
        except Exception as err:
            print('delete from mongo error', err)


if __name__ == '__main__':
    # Ad-hoc script entry point: iterates every record returned for the
    # 'linan_clean' sheet and starts building a dict keyed by the record's
    # LINK value.
    # NOTE(review): the visible code never consumes `json`, `redis_conn`,
    # `cleaner` or `input_value`; the script appears to continue beyond
    # this excerpt - confirm before removing anything.
    from redis_util import get_redis_conn
    import json
    mongo_instance = MongodbClass()
    redis_conn = get_redis_conn()
    # The four commented-out lines below are earlier experiments kept for
    # reference. Their filter values are Chinese category names, roughly
    # "other transactions", "engineering construction" and
    # "tender announcement".
    #mongo_instance.get_from_db('zhuji',return_field='_id')
    #select_conditions = {'an_major':'其他交易'}
    ##select_conditions = {'an_major':'工程建设','an_type':"招标公告"}
    ##mongo_instance.detect_from_db('beilun',select_conditions)
    # Cleaner configured with style and scripts switched on and
    # page_structure and links switched off.
    # NOTE(review): presumably lxml's HTML Cleaner - confirm the import.
    cleaner = Cleaner(page_structure=False,
                      links=False,
                      style=True,
                      scripts=True)
    for data in mongo_instance.get_all_from_db('linan_clean'):
        # Progress output: the ID of the record being processed.
        print(data['ID'])
        # Disabled filter that would skip records whose PUBDATE does not
        # start with '2019'.
        #if not data['PUBDATE'].startswith('2019'):
        #    continue

        # Fresh per-record mapping: {LINK: {}}.
        input_value = {}
        input_value[data['LINK']] = {}
示例#7
0
 def __init__(self, sheet_name):
     """Create the Mongo and Redis clients used for one sheet.

     Args:
         sheet_name: Name stored on ``self.sheet_name``.
     """
     self.sheet_name = sheet_name
     # Clients are created in the same order as before: Mongo, then Redis.
     mongo_client = MongodbClass()
     redis_client = get_redis_conn()
     self.mongo_conn = mongo_client
     self.redis_conn = redis_client