def get_updated():
    db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306)
    cursor = db.cursor()
    sql = "select record_id from spider_sh_related_internet_medicine_msg_server"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    # print(db_data)
    # exit()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000,error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
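
The filter returned above is typically consulted before new rows are written, so records whose record_id is already stored get skipped. A minimal usage sketch (the record_id variable and the insert step are hypothetical):

bloom = get_updated()
if record_id not in bloom:      # record_id not seen in the table yet
    bloom.add(record_id)
    # ... insert the new record here ...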
Example No. 2
def blood_filter_pickle(sum):
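    # NOTE: "cursor" here is assumed to be a module-level pymysql cursor created elsewhere in the original module.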

    sql = "select * from qcc_source_data"
    cursor.execute(sql)
    data = cursor.fetchall()
    if len(data) == 0:
        return sum
    data = [i[2] for i in data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    sum = [i for i in sum if i['园区id'] not in bloom]
    return sum
Example No. 3
def updated():
    db = pymysql.connect(host="192.168.2.97",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select url from spider_2_company_revoke"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0].strip() for i in db_data]
    bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    for i in data:
        bloom.add(i)
    return bloom
Example No. 4
class Spider_related(threading.Thread,Downloader):
    def __init__(self,keyList_queue,writer,contain):
        super(Spider_related, self).__init__()
        self.keyList_queue = keyList_queue
        self.writer = writer
        self.contain = contain

        # Scalable Bloom filter that grows automatically as needed
        self.bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)


    def run(self):
        while True:
            key = self.keyList_queue.get()

            # Filter duplicate keywords: skip the key if it has already been seen
            if key in self.bloom:
                self.keyList_queue.task_done()
                continue

            # Download the page source for this keyword
            source = self.download(key)
            self.bloom.add(key)    # add each crawled key so duplicates are filtered out later

            # Skip if the returned page source is None
            if source is None:
                # task_done(): queue.join() only stops blocking once consumers have processed every queued item
                self.keyList_queue.task_done()
                continue

            # Parse new keywords out of the page source
            self.parse_keyList(source)
            self.writer.flush()  # flush the keyword file after each page
            self.keyList_queue.task_done()

    def parse_keyList(self,source):
        ele = etree.HTML(source)
        keyList = ele.xpath('//table//tr//th/a/text()')
        for key in keyList:

            for con in self.contain:
                if con in key:
                    if key in self.bloom:
                        return
                    else:
                        self.writer.write('{}\n'.format(key))
                        self.keyList_queue.put(key)
                        print('New keyword: {}'.format(key))
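
A minimal sketch of how these worker threads might be wired up (the queue setup, output file name, thread count, and seed keyword are all assumptions; the Downloader mixin is expected to provide the download() method used above):

import queue

keyList_queue = queue.Queue()
writer = open('keywords.txt', 'a', encoding='utf8')
contain = ['keyword']              # substrings a harvested key must contain

for _ in range(4):                 # start four crawler threads
    t = Spider_related(keyList_queue, writer, contain)
    t.daemon = True
    t.start()

keyList_queue.put('seed keyword')  # initial search term
keyList_queue.join()               # returns once every queued key is task_done()
writer.close()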
Example No. 5
def get_updated():
    db = pymysql.connect(host="192.168.2.97",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select license_num from spider_qualification"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 6
 def get_bloomFilter(self, sql):
     bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     db = pymysql.connect("localhost", "root", "123456", "mytest", charset='utf8')
     cursor = db.cursor()
     cursor.execute(sql)
     desc = cursor.description
     object_dict = [
         dict(zip([col[0] for col in desc], row))
         for row in cursor.fetchall()
     ]
     # print(object_dict)
     cursor.close()
     for d in object_dict:
         # print(d)
         bloom.add(d)
     return bloom
Example No. 7
def get_updated():
    db = pymysql.connect(host="192.168.2.99",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select record_num from spider_sh_company_medical_equipment_network_third_party_platform"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0] for i in db_data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 8
class RemoveSameUrlPipLine:
    def __init__(self):
        redis_db = redis.Redis(host='127.0.0.1',
                               port=6379,
                               db=0,
                               decode_responses=True)
        result = redis_db.smembers('spider:url')
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        for item in result:
            self.sbf.add(item)

    def process_item(self, item, spider):
        if item['link'] in self.sbf:
            raise DropItem("same title in %s" % item['link'])
        else:
            return item
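
For the pipeline to take effect it has to be registered in the Scrapy project settings; a minimal sketch (the module path is an assumption):

# settings.py (hypothetical project layout)
ITEM_PIPELINES = {
    'myproject.pipelines.RemoveSameUrlPipLine': 300,
}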
Example No. 9
class IgnoreRequestMiddleware(object):
    """
        URL request de-duplication
    """
    def __init__(self):
        # Scalable Bloom filter that grows automatically as needed
        self.sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)

    def process_request(self, request, spider):
        if not request.url:
            return None
        url_hash = hashlib.md5(request.url.encode("utf8")).hexdigest()
        if url_hash in self.sbf:
            raise IgnoreRequest("Spider : %s, IgnoreRequest : %s" %
                                (spider.name, request.url))
        else:
            self.sbf.add(url_hash)
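
Likewise, this middleware only runs once it is enabled in the downloader-middleware settings; a minimal sketch (the module path is an assumption):

# settings.py (hypothetical project layout)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreRequestMiddleware': 543,
}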
Example No. 10
def get_bloom():
    db = pymysql.connect(host="192.168.2.97",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select company_num from company_title"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    data = [i[0] for i in db_data]
    print(data)
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    db.close()
    return bloom
Example No. 11
def main():
    '''
    Companies that have an ICP/EDI domain filing
    :return:
    '''
    db = pymysql.connect(host="192.168.2.99",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = 'select distinct (company_name) from spider_company_icp;'
    cursor.execute(sql)
    domain_data = cursor.fetchall()
    domain_data = [i[0] for i in domain_data]
    # print(domain_data)
    print('ICP domain filings ---- count:{}'.format(len(domain_data)))
    '''
    Companies without an ICP/EDI licence
    :return:
    '''
    add_value_telecom_lists = model1_filter_data(
        'spider_add_value_telecom_info_hz')
    industry_lists = model1_filter_data('spider_industry_information')
    industry_gov_lists = model1_filter_data('spider_industry_information_gov')

    p = add_value_telecom_lists + industry_lists + industry_gov_lists
    bloom = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.001)
    for i in p:
        bloom.add(i)
    sum = []
    for company in domain_data:
        if company not in bloom:
            sum.append(company)
        else:
            print('has an ICP licence', company)
    print('ICP domain filings without an ICP licence ----- count:{}'.format(len(sum)))
    print(sum)
    for i in sum:
        medicine = Medicine(company_name=i)
        session.add(medicine)

    session.commit()
    session.close()
Example No. 12
def generate_mul_col_bloom(conf, capacity, cursor):
    """
    Build a Bloom filter for composite foreign-key analysis from the config, the initial capacity, and the cursor data.
    :param conf: configuration object
    :param capacity: initial capacity of the Bloom filter
    :param cursor: cursor over the composite-key values
    :return: the Bloom filter object
    """
    assert isinstance(conf, Config)
    b = ScalableBloomFilter(initial_capacity=capacity,
                            error_rate=conf.bloom_error_rate)
    while True:
        row = cursor.fetchone()
        if not row:
            break
        # Core step: for each composite-key row, hash its stripped column values and add the hash to the Bloom filter
        hash_elem = get_md5([str(elem).rstrip() for elem in row])
        b.add(hash_elem)
    return b
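
get_md5 is not shown in this example; a minimal sketch of such a helper, assuming it simply joins the stripped column values and returns the hex digest (hypothetical implementation):

import hashlib

def get_md5(values):
    # values: the stripped column values of one composite-key row
    return hashlib.md5('|'.join(values).encode('utf8')).hexdigest()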
Example No. 13
def get_updated():
    db = pymysql.connect(host="192.168.2.99",
                         user="******",
                         password='******',
                         database="spider",
                         port=3306)
    cursor = db.cursor()
    sql = "select permit_number from spider_add_value_telecom_info"
    cursor.execute(sql)
    db_data = cursor.fetchall()
    # print(db_data)
    # exit()
    data = [i[0].strip() for i in db_data]
    # print(data)
    # exit()
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    return bloom
Example No. 14
 def collection_list_field(list, data, main_field):
     if list:
         bloom = ScalableBloomFilter(
             mode=ScalableBloomFilter.SMALL_SET_GROWTH)
         for l in list:
             bloom.add(l)
         if data not in bloom:
             for l in list:
                 if l[main_field] == data[main_field]:
                     list.remove(l)
                     item = {}
                     for k, v in l.items():
                         item[k] = databaseTool.collection_field(
                             databaseTool.is_field_dict(l, k),
                             databaseTool.is_field_dict(data, k))
                     list.append(item)
                 else:
                     list.append(data)
     else:
         list.append(data)
     return list
Example No. 15
class ConOfAllData(object):
    def __init__(self, site_name):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.crawlSpider
        self.col_url = self.db[site_name + "_url"]
        self.col_content = self.db[site_name + "_content"]
        self.sbf = ScalableBloomFilter(initial_capacity=100)
        for item in self.col_url.find():
            self.sbf.add(item["url"])
        self.insert_url = []
        self.insert_content = []

    def isexist(self, url):
        if url in self.sbf:
            return True
        else:
            self.sbf.add(url)
            self.insert_url.append({"url": url})
            return False

    def insert(self, content):
        if content['real_url'] is not None and content[
                'title'] is not None and content['abstract'] is not None:
            self.insert_content.append(content)

    def end(self):
        if len(self.insert_url) != 0:
            self.col_url.insert_many(self.insert_url)
        if len(self.insert_content) != 0:
            self.col_content.insert_many(self.insert_content)


#if __name__ == "__main__":
#coad = ConOfAllData("ningxia")
#coad.isexist("1")
#coad.isexist('2')
#coad.isexist("1")
#coad.insert({"site": "ningxia"})
#coad.insert({"site": "guangdong"})
#coad.end()
Example No. 16
 def delete_duplicate_data(self, file):
     """
     De-duplicate the lines of the given file
     :param file:
     :return:
     """
     bloom = ScalableBloomFilter(initial_capacity=100,
                                 error_rate=0.00000001)
     temp_name = file.replace(".txt", "_temp.txt")
     with open(file, 'r',
               encoding='utf8') as r_f, open(temp_name,
                                             'a',
                                             encoding='utf8') as w_f:
         for line in r_f:
             line_content = line.strip()
             if line_content not in bloom:
                 bloom.add(line_content)
                 w_f.write(line_content + "\n")
             else:
                 print(line_content)
     os.remove(file)
     os.rename(temp_name, file)
Example No. 17
class GanjiPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # db = self.connection[settings['MONGODB_DB']]
        # self.collection = db[settings['MONGODB_COLLECTION']]
        # # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        self.add_num = 0
        self.drop_num = 0

        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']

        self.df_result = pd.DataFrame()

        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
        # # read
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):

        if spider.name in [
                'ganji', 'crawl_jingzhengu', 'xiaozhu', 'hry2car', 'che168',
                'youxin', 'chesupai', 'youxin_master', 'auto51',
                'autohome_butie'
        ]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)

            field_list = [
                "carsource", "grab_time", "price1", "mileage", "post_time",
                "sold_date", "city", "registerdate"
            ]
            data = dict()
            for field in field_list:
                data[field] = item[field] if field in item else None

            if returndf:
                self.drop_num += 1
                valid = False
            else:
                pass
            if valid:
                self.fa.writelines(i + '\n')
                # write the data to MySQL
                items = list()
                items.append(item)
                df = pd.DataFrame(items)
                if spider.name in [
                        'test',
                ]:
                    self.df_result = pd.concat([self.df_result, df])
                    self.mongocounts += 1
                    logging.log(
                        msg=
                        f"add              {self.mongocounts}              items",
                        level=logging.INFO)
                else:
                    df.to_sql(name=self.settings['MYSQL_TABLE'],
                              con=self.conn,
                              if_exists="append",
                              index=False)
                    self.mongocounts += 1
                    logging.log(
                        msg=
                        f"scrapy              {self.mongocounts}              items",
                        level=logging.INFO)

    def close_spider(self, spider):
        # self.connection.close()
        logging.log(
            msg=f"drop              {self.drop_num}              items",
            level=logging.INFO)
        if spider.name in ['test']:
            self.df_result.to_sql(name=self.settings['MYSQL_TABLE'],
                                  con=self.conn,
                                  if_exists="append",
                                  index=False)
        self.conn.dispose()
Example No. 18
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
        ignore_filter_dir='../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            try:
            	f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
            	f.write(self.ignore_filter)
            except IOError:
            	f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
        else:
            if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir='../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError


                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        	continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()


                    return article


            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or
                (not filt.regex and filt.pattern in url)):
                return True
        return False
Example No. 19
class GanjiPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # db = self.connection[settings['MONGODB_DB']]
        # self.collection = db[settings['MONGODB_COLLECTION']]
        # # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        self.add_num = 0
        self.drop_num = 0

        self.log_dict = {
            "projectName": "used-car-scrapy",
            "logProgram": '',
            "logProgramPath": str(pathlib.Path.cwd()),
            "logPath": "/home/logs/usedcar_new",
            "logTime": '',
            "logMessage": "",
            "logServer": "192.168.1.241",
            "logObjectType": "UsedCarPaChong",
            "logObject": {
                "field": '',
                "info": {
                    "dataBaseType": 'mysql',
                    "dataBaseName": '',
                    "tableName": '',
                    "saveStatus": ''
                }
            }
        }

        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']

        self.df_result = pd.DataFrame()

        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
        # # read
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        if spider.name in ['chesupai']:
            self.log_dict["logServer"] = "192.168.1.241"
        if spider.name in ['anxinpai']:
            self.log_dict["logServer"] = "192.168.1.249"
        if spider.name in ['crawl_jingzhengu', 'ganji']:
            self.log_dict["logServer"] = "192.168.1.248"
        if spider.name in ['xiaozhu', 'hry2car']:
            self.log_dict["logServer"] = "192.168.1.92"

        if spider.name in [
                'ganji', 'crawl_jingzhengu', 'xiaozhu', 'hry2car', 'che168',
                'youxin', 'chesupai', 'youxin_master', 'auto51'
        ]:
            valid = True
            i = md5(item['statusplus'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)

            self.log_dict["logProgram"] = spider.name
            self.log_dict["logTime"] = item["grab_time"]
            self.log_dict["logType"] = 'INFO'
            self.log_dict["logMessage"] = "successful"
            field_list = [
                "carsource", "grab_time", "price1", "mileage", "post_time",
                "sold_date", "city", "registerdate"
            ]
            data = dict()
            for field in field_list:
                data[field] = item[field] if field in item else None
            self.log_dict["logObject"]["field"] = data
            self.log_dict["logObject"]["field"]["carsource"] = item[
                "car_source"]
            self.log_dict["logObject"]["info"][
                "dataBaseName"] = "usedcar_update"
            self.log_dict["logObject"]["info"][
                "tableName"] = spider.name + '_online'

            if returndf:
                self.drop_num += 1
                valid = False
                # self.log_dict["logObject"]["info"]["saveStatus"] = "false"
                self.log_dict["logObject"]["info"]["saveStatus"] = "true"
                logging.log(msg=json.dumps(self.log_dict, ensure_ascii=False),
                            level=logging.INFO)
                # raise DropItem("Drop data {0}!".format(item["url"]))
            else:
                pass
            if valid:
                self.fa.writelines(i + '\n')
                # write the data to MySQL
                items = list()
                items.append(item)
                df = pd.DataFrame(items)
                if spider.name in [
                        'test',
                ]:
                    self.df_result = pd.concat([self.df_result, df])
                    self.mongocounts += 1
                    logging.log(
                        msg=
                        f"add              {self.mongocounts}              items",
                        level=logging.INFO)
                else:
                    # self.log_dict["logObject"]["info"]["saveStatus"] = "true"
                    self.log_dict["logObject"]["info"]["saveStatus"] = "false"
                    logging.log(msg=json.dumps(self.log_dict,
                                               ensure_ascii=False),
                                level=logging.INFO)
                    df.to_sql(name=self.settings['MYSQL_TABLE'],
                              con=self.conn,
                              if_exists="append",
                              index=False)
                    self.mongocounts += 1
                    logging.log(
                        msg=
                        f"scrapy              {self.mongocounts}              items",
                        level=logging.INFO)

    def close_spider(self, spider):
        # self.connection.close()
        logging.log(
            msg=f"drop              {self.drop_num}              items",
            level=logging.INFO)
        if spider.name in ['test']:
            self.df_result.to_sql(name=self.settings['MYSQL_TABLE'],
                                  con=self.conn,
                                  if_exists="append",
                                  index=False)
        self.conn.dispose()
Example No. 20
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(initial_capacity=10000000,
                                                 error_rate=0.00001)
        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(initial_capacity=10000000,
                                                     error_rate=0.00001)
            try:
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'r+')
                f.write(self.ignore_filter)
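                # NOTE: write() expects a string, so passing the filter object here would fail;
                # pybloom_live filters are normally persisted with tofile()/fromfile() on a binary file.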
            except IOError:
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'w+')
            f.close()
        else:
            if (not (os.path.exists('../ignore_filter/' + self.site.name +
                                    '_ignore_file.txt'))):
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name +
                      '_ignore_file.txt',
                      'r+',
                      buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt',
                  'a') as ignore_filter_file:
            try:
                current_level = 0
                while (True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(
                            self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(
                                current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000, error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name +
                                  '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(
                                u"skipping url \"{0}\" because it matches filter"
                                .format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if (parsed_url.scheme != u"http"
                                    and parsed_url.scheme != u"https"):
                                logging.info(
                                    u"skipping url with invalid scheme: {0}".
                                    format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(
                                urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(
                                u"skipping malformed url {0}. Error: {1}".
                                format(url, str(e)))
                            continue
                        if (not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and
                                not (u"-subscribe" in url or "-subscribe"
                                     or u"subscribe-" in url or "subscribe-")):
                            continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put(
                                (url, str(int(current_level) + 1)))
                            logging.info(
                                u"added {0} to the to_visit as well as the level {1}"
                                .format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(
                                u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE))
                    or (not filt.regex and filt.pattern in url)):
                return True
        return False
Example No. 21
class AutohomeNewPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )

        # mongo
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        db = self.connection[settings['MONGODB_DB']]
        website = settings["WEBSITE"]
        self.collection = db[settings['MONGODB_COLLECTION']]
        # count
        self.mysqlcounts = 0
        self.counts = 0

        self.settings = settings
        # bloom file
        self.CrawlCar_Num = 1000000
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB']
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
        self.counts = 0

    def process_item(self, item, spider):
        # spiders that should be de-duplicated on the "status" field and stored in MongoDB
        if spider.name in ["autohome_dealer", " "]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.writelines(i + '\n')
                self.collection.insert(dict(item))
                logging.log(msg="Car added to MongoDB database!",
                            level=logging.INFO)
                self.counts += 1
                logging.log(msg="scrapy                    " +
                            str(self.counts) + "                  items",
                            level=logging.INFO)
                return item
        # spiders stored in MongoDB without de-duplication
        elif spider.name in ["", " "]:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!",
                        level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) +
                        "                  items",
                        level=logging.INFO)
            return item
        # spiders de-duplicated on the "status" field and stored in MySQL
        elif spider.name in ['', ' ']:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.flush()
                self.fa.writelines(i + '\n')
                self.mysqlcounts += 1
                logging.log(
                    msg=
                    f"scrapy              {self.mysqlcounts}              items",
                    level=logging.INFO)
                # write the data to MySQL
                items = list()
                items.append(item)
                df = pd.DataFrame(items)
                df.to_sql(name=self.settings['MYSQL_TABLE'],
                          con=self.conn,
                          if_exists="append",
                          index=False)
                logging.log(msg=f"add data in mysql", level=logging.INFO)
                return item
        # spiders stored in MySQL without de-duplication
        elif spider.name in ['baidu', '']:
            self.mysqlcounts += 1
            logging.log(
                msg=
                f"scrapy              {self.mysqlcounts}              items",
                level=logging.INFO)
            # write the data to MySQL
            items = list()
            items.append(item)
            df = pd.DataFrame(items)
            df.to_sql(name=self.settings['MYSQL_TABLE'],
                      con=self.conn,
                      if_exists="append",
                      index=False)
            logging.log(msg=f"add data in mysql", level=logging.INFO)
            return item

    def close_spider(self, spider):
        self.connection.close()
        self.fa.close()
Example No. 22
class WeiboCnSpider:
    def __init__(self, tasks=2, loop=None):
        self.tasks = tasks
        self.loop = loop or asyncio.get_event_loop()
        self.redis_cookie = RedisCookie()
        self.redis_cookie_now = src.redis_cookies.RedisCookies()
        self.redis_job = RedisJob()
        self.bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.weibo_limit = True
        self.time_current_pattern = re.compile(r'(\d*)分钟前')
        self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)')
        self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)')
        self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)')
        self.weibo_host = 'https://weibo.cn'
        self.follow_url = self.weibo_host + '/%s/follow'
        self.redis_job_now = src.redis_cookies.RedisJob()
        self.fan_url = self.weibo_host + '/%s/fans'
        self.user_info_url = self.weibo_host + '/%s/info'
        self.user_tweet_url = self.weibo_host + '/%s'
        self.user_tweet_url2 = self.weibo_host + '/%s?page=%d'
        self.user_repost_url = self.weibo_host + '/repost/%s'
        self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d'
        self.tweet_comment_url = self.weibo_host + '/comment/%s'
        self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d'
        self.weibo_producer = WeiboProcuder(['localhost:9092'], 'lyzzrO_U')
        self.search_url = 'https://weibo.cn/search/?pos=search'
        self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori'
        self.topic_max_page = 500
        self.super_new_fan_max_page = 500
        self.topic_url = 'https://m.weibo.cn/api/container/getIndex?containerid=1008086a8d30da45d91717a596df498af918aa_-_feed&page=%d'
        self.super_new_fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=2311406a8d30da45d91717a596df498af918aa_-_super_newfans&page=%d'

    async def crawl_follow(self):
        while True:
            follow_dict = self.redis_job_now.fetch_job(JobType.follower.value)
            if follow_dict:
                try:
                    await self.grab_follow(follow_dict)
                    LOGGER.info('finish %d follow crawl ' % follow_dict['uid'])
                except TimeoutError as e:
                    print(e)
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def grab_follow(self, follow_dict):
        LOGGER.info('start grab user follow: %s' % str(follow_dict))
        html_content = await self.grab_html(follow_dict['url'])
        follow_html = BeautifulSoup(html_content, "lxml")
        all_td = follow_html.find_all('td', style=True)
        follow_id=[]
        for td in all_td:
            a = td.find('a').get('href')
            usr_id_result = self.user_id_pattern.findall(a)
            if usr_id_result:
                usr_id = usr_id_result[0]
            else:
                usr_id = await self.get_user_id_from_homepage(a)
            if usr_id not in follow_id:
                follow_id.append(int(usr_id))
        user_follow_dict={}
        follow_id_key_list = [i for i in range(len(follow_id))]
        follow_id = dict(zip(follow_id_key_list, follow_id))
        user_follow_dict['type'] = 'follow'
        user_follow_dict['uid'] = follow_dict['uid']
        user_follow_dict['fans_id'] = follow_id
        await self.weibo_producer.send(user_follow_dict, self.follow_url % follow_dict['uid'])
        if 'page=' not in follow_dict['url']:
            page_div = follow_html.find(id='pagelist')
            if page_div:
                max_page = int(page_div.input.get('value'))
                if max_page>20:
                    max_page=20
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(JobType.follower.value,
                                                  {'url': (self.follow_url % follow_dict['uid']) + '?page=' + str(page),
                                                   'uid': follow_dict['uid']})

    async def crawl_fan(self):
        while True:
            fan_dict = self.redis_job_now.fetch_job(JobType.fan.value)
            if fan_dict:
                try:
                    await self.grab_fan(fan_dict)
                    LOGGER.info('finish %d fan crawl ' % fan_dict['uid'])
                except TimeoutError as e:
                    print(e)
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def grab_fan(self, fan_dict):
        LOGGER.info('start grab user fan: %s' % str(fan_dict))
        html_content = await self.grab_html(fan_dict['url'])
        fan_html = BeautifulSoup(html_content, "lxml")
        all_td = fan_html.find_all('td', style=True)
        fans_id=[]
        for td in all_td:
            a = td.find('a').get('href')
            usr_id_result = self.user_id_pattern.findall(a)
            if usr_id_result:
                usr_id = usr_id_result[0]
            else:
                usr_id = await self.get_user_id_from_homepage(a)
            if usr_id not in fans_id:
                fans_id.append(int(usr_id))
        user_fan_dict={}
        fans_id_key_list = [i for i in range(len(fans_id))]
        fans_id = dict(zip(fans_id_key_list,fans_id))
        user_fan_dict['type'] = 'fan'
        user_fan_dict['uid'] = fan_dict['uid']
        user_fan_dict['fans_id'] = fans_id
        await self.weibo_producer.send(user_fan_dict, self.fan_url % fan_dict['uid'])
        if 'page=' not in fan_dict['url']:
            page_div = fan_html.find(id='pagelist')
            if page_div:
                max_page = int(page_div.input.get('value'))
                if max_page>20:
                    max_page=20
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(JobType.fan.value,
                                                  {'url': (self.fan_url % fan_dict['uid']) + '?page=' + str(page),
                                                   'uid': fan_dict['uid']})

    async def crawl_comment(self):
        while True:
            comment_job_info = await self.redis_job.fetch_job(JobType.comment.value)
            if comment_job_info:
                try:
                    # asyncio.run_coroutine_threadsafe(self.grab_tweet_comments(comment_job_info), self.loop)
                    await self.grab_tweet_comments(comment_job_info)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error("something error")
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def crawl_repost(self):
        while True:
            repost_job_info = await self.redis_job.fetch_job(JobType.repost.value)
            if repost_job_info:
                try:
                    await self.grab_tweet_repost(repost_job_info)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error("something error")
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def crawl_weibo(self):
        r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
        while True:
            tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value)
            if tweet_job_info:
                m = r.findall(tweet_job_info['url'])
                if m:
                    page_no = int(m[0][1])
                    if page_no > 200:
                        LOGGER.info('job passed %s' % str(tweet_job_info))
                        continue
                # if 'page=' in tweet_job_info['url']:
                #     LOGGER.info('job passed %s' % str(tweet_job_info))
                #     continue

                try:
                    await self.grab_user_tweet(tweet_job_info)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def search(self):
        while True:
            search_job_info = await self.redis_job.fetch_job(JobType.search.value)
            if search_job_info:
                try:
                    await self.search_tweet(search_job_info)
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def crawl_user(self):
        while True:
            user_job_info = self.redis_job_now.fetch_job(JobType.user.value)
            if user_job_info:
                try:
                    if 'source' in user_job_info:
                        await self.grab_user_info(user_job_info['user_id'], user_job_info['source'])
                    else:
                        await self.grab_user_info(user_job_info['user_id'])
                    # await self.redis_job.push_job(JobType.tweet.value,
                    #                               {'url': 'https://weibo.cn/' + user_job_info['user_id'],
                    #                                'uid': user_job_info['user_id']})

                    await self.redis_job.push_job(JobType.follower.value,
                                                  {'url': self.follow_url % user_job_info['user_id'],
                                                   'uid': user_job_info['user_id']})

                    await self.redis_job.push_job(JobType.fan.value,
                                                  {'url': self.fan_url % user_job_info['user_id'],
                                                   'uid': user_job_info['user_id']})
                    # self.weibo_queue.put({'url': self.user_tweet_url % user_id, 'uid': user_id})
                    # self.follow_queue.put({'uid': user_id, 'url': self.follow_url % user_id})
                except TimeoutError as e:
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def search_tweet(self, search_job_info):
        html_content = await self.grab_html(search_job_info['url'])

        result_html = BeautifulSoup(html_content, "lxml")
        if 'page' not in search_job_info['url']:
            total_count_str = result_html.find(text=re.compile(r'共\d*条'))
            print(total_count_str)
            total_count_result = re.findall(r'共(\d*)条', total_count_str)
            if total_count_result:
                total_count = total_count_result[0]
                total_page = int(total_count) / 10
                for page_no in range(2, int(total_page)):
                    await self.redis_job.push_job(JobType.search.value, {
                        'url': search_job_info['url'] + '&page=' + str(page_no)
                    })

        tweet_divs = result_html.find_all(id=True, class_='c')
        for tweet_div in tweet_divs:
            tweet = {}
            nk_div = tweet_div.find('a', class_='nk')
            if nk_div:
                nk_url = nk_div.get('href')
                usr_id_result = self.user_id_pattern.findall(nk_url)
                if usr_id_result:
                    usr_id = usr_id_result[0]
                else:
                    usr_id = await self.get_user_id_from_homepage(nk_url)
            else:
                usr_id = 'unknown'
            if tweet_div.find(class_='cmt', string='转发理由:'):  # repost
                tweet['flag'] = '转发'
                parent = tweet_div.find(class_='cmt', string='转发理由:').parent
                try:
                    comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href')

                    href = comment_href.split('?')[0]
                    tweet['sourceTid'] = href.split('/')[-1]

                except Exception:
                    pass
                text = parent.get_text()
                fields = text.split('\xa0')

                content = fields[0][5:]
                ct_content = parent.find('span', class_='ct').get_text()
                time_source = ct_content.split('\u6765\u81ea')

                time = time_source[0]
                if len(time_source) == 2:
                    source = time_source[1]
                else:
                    source = 'unknown'
                other = ';'.join(fields[1:])

            else:
                tweet['flag'] = '原创'
                text = tweet_div.get_text()
                ct_content = tweet_div.find('span', class_='ct').get_text()
                time_source = ct_content.split('\u6765\u81ea')

                time = time_source[0]
                if len(time_source) == 2:
                    source = time_source[1]
                else:
                    source = 'unknown'
                fields = text.split('\u200b')
                content = fields[0]
                other_fields = fields[-1].split('\xa0')
                other = ';'.join(other_fields[1:])

            like = re.findall(u'\u8d5e\[(\d+)\];', other)  # like count
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', other)  # repost count
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', other)  # comment count
            tweet['content'] = content.strip()
            div_id = tweet_div.get('id')
            tweet['id'] = div_id[2:] if div_id.startswith('M_') else div_id
            tweet['time'] = self.get_time(str(time))
            tweet['source'] = source
            tweet['like'] = like[0] if like else -1
            tweet['transfer'] = transfer[0] if transfer else -1
            tweet['comment'] = comment[0] if comment else -1
            tweet['type'] = 'tweet_info'
            tweet['uid'] = usr_id
            print(tweet)
            await self.weibo_producer.send(tweet, search_job_info['url'])
            await self.redis_job.push_job(JobType.tweet.value,
                                          {'url': self.user_tweet_url % tweet['id'],
                                           'uid': usr_id})
            await self.redis_job.push_job(JobType.comment.value,
                                          {'url': self.tweet_comment_url % tweet['id'],
                                           'tweetId': tweet['id']})

    async def grab_user_tweet(self, tweet_job_info):
        LOGGER.info('start grab tweet: %s' % str(tweet_job_info))
        html_content = await self.grab_html(tweet_job_info['url'])

        user_tweet_html = BeautifulSoup(html_content, "lxml")
        tweet_divs = user_tweet_html.find_all(id=True, class_='c')
        for tweet_div in tweet_divs:
            tweet = {}
            if tweet_div.find(class_='cmt', string='转发理由:'):  # repost
                tweet['flag'] = '转发'
                parent = tweet_div.find(class_='cmt', string='转发理由:').parent
                try:
                    comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href')

                    href = comment_href.split('?')[0]
                    tweet['sourceTid'] = href.split('/')[-1]

                except Exception:
                    pass
                text = parent.get_text()
                fields = text.split('\xa0')

                content = fields[0][5:]
                ct_content = parent.find('span', class_='ct').get_text()
                time_source = ct_content.split('\u6765\u81ea')

                time = time_source[0]
                if len(time_source) == 2:
                    source = time_source[1]
                else:
                    source = 'unknown'
                other = ';'.join(fields[1:])

            else:
                tweet['flag'] = '原创'
                text = tweet_div.get_text()
                ct_content = tweet_div.find('span', class_='ct').get_text()
                time_source = ct_content.split('\u6765\u81ea')

                time = time_source[0]
                if len(time_source) == 2:
                    source = time_source[1]
                else:
                    source = 'unknown'
                fields = text.split('\u200b')
                content = fields[0]
                other_fields = fields[-1].split('\xa0')
                other = ';'.join(other_fields[1:])

            like = re.findall(u'\u8d5e\[(\d+)\];', other)  # like count
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', other)  # repost count
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', other)  # comment count
            tweet['content'] = content.strip()
            tweet['id'] = tweet_div.get('id')
            tweet['time'] = self.get_time(str(time))
            tweet['source'] = source
            tweet['like'] = like[0] if like else -1
            tweet['transfer'] = transfer[0] if transfer else -1
            tweet['comment'] = comment[0] if comment else -1
            tweet['type'] = 'tweet_info'
            tweet['uid'] = tweet_job_info['uid']

            await self.weibo_producer.send(tweet, tweet_job_info['url'])
            # fetch the comments
            # self.comment_queue.put({'url': self.tweet_comment_url % tweet['id'][2:],
            #                         'tweetId': tweet['id'][2:]})

        if 'page=' not in tweet_job_info['url']:
            page_div = user_tweet_html.find(id='pagelist')
            if page_div:
                max_page = int(page_div.input.get('value'))
                if self.weibo_limit:
                    max_page = max_page if max_page < 500 else 500
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(JobType.tweet.value,
                                                  {'url': self.user_tweet_url2 % (tweet_job_info['uid'], page),
                                                   'uid': tweet_job_info['uid']})

    async def grab_user_info(self, user_id, source='unknown'):
        LOGGER.info('start grab user info: %s' % user_id)
        html_content = await self.grab_html(self.user_info_url % user_id)
        user_info_html = BeautifulSoup(html_content, "lxml")
        div_list = list(user_info_html.find_all(class_=['c', 'tip']))

        base_info_index, edu_info_index, work_info_index = -1, -1, -1
        base_info = ''
        edu_info = ''
        work_info = ''
        tags = ''
        user_info = {}
        for index, div in enumerate(div_list):
            text = div.text
            if text == u'基本信息':
                base_info_index = index
            elif text == u'学习经历':
                edu_info_index = index
            elif text == u'工作经历':
                work_info_index = index
        if base_info_index != -1:
            b = div_list[base_info_index + 1]
            tags = ','.join(map(lambda a: a.get_text(), b.find_all('a')))
            base_info = b.get_text(';')
        if edu_info_index != -1:
            edu_info = div_list[edu_info_index + 1].get_text(';')

        if work_info_index != -1:
            work_info = div_list[work_info_index + 1].get_text(';')
        base_info += ';'
        nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', base_info)  # nickname
        if nickname:
            user_info['nickname'] = nickname[0] if nickname else 'unknown'
            gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', base_info)  # gender
            place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', base_info)  # region (province and city)
            signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', base_info)  # bio
            birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', base_info)  # birthday
            sex_orientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', base_info)  # sexual orientation
            marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', base_info)  # marital status
            head_url = user_info_html.find('img', alt='头像')
            if head_url:
                user_info['head'] = head_url.get('src')
            user_info['tags'] = tags
            user_info['gender'] = gender[0] if gender else 'unknown'
            user_info['place'] = place[0] if place else 'unknown'
            user_info['signature'] = signature[0] if signature else 'unknown'
            user_info['birthday'] = birthday[0] if birthday else 'unknown'
            user_info['sexOrientation'] = sex_orientation[0] if sex_orientation else 'unknown'
            user_info['eduInfo'] = edu_info if edu_info else 'unknown'
            user_info['marriage'] = marriage[0] if marriage else 'unknown'
            user_info['workInfo'] = work_info if work_info else 'unknown'
            user_info['type'] = 'user_info'
            user_info['id'] = user_id
            user_info['source'] = source
            result = await self.grab_view(user_id)
            user_info.update(result)
            await self.weibo_producer.send(user_info, self.user_info_url % user_id)

    async def grab_view(self, user_id):
        """
        Fetch the user's tweet count, fan count and follow count.
        :param user_id: user id
        :return: dict
        """
        LOGGER.info('grab user view: %s' % str(user_id))
        html_content = await self.grab_html(self.weibo_host + '/' + str(user_id))
        home_page_html = BeautifulSoup(html_content, "lxml")
        v = home_page_html.find('div', class_='tip2')
        result = {}
        if v:
            content = v.get_text(';')
        else:
            content = ''
        tweet_r = re.findall(r'微博\[(\d+)\];', content)
        result['tweetNum'] = tweet_r[0] if tweet_r else -1
        fans_r = re.findall(r'粉丝\[(\d+)\];', content)
        result['fansNum'] = fans_r[0] if fans_r else -1
        follow_r = re.findall(r'关注\[(\d+)\];', content)
        result['followNum'] = follow_r[0] if follow_r else -1
        return result

    def get_time(self, time_str):
        current_result = self.time_current_pattern.findall(time_str)
        time_now = datetime.datetime.now()
        if current_result:
            result_time = time_now - datetime.timedelta(minutes=int(current_result[0]))
            return result_time.strftime('%Y-%m-%d %H:%M:%S')
        else:
            current_result = self.time_today_pattern.findall(time_str)
            if current_result:
                result_time = datetime.datetime(time_now.year, time_now.month,
                                                time_now.day, int(current_result[0][0]), int(current_result[0][1]))
                return result_time.strftime('%Y-%m-%d %H:%M:%S')
            else:
                current_result = self.time_year_pattern.findall(time_str)
                if current_result:
                    result_time = datetime.datetime(time_now.year, int(current_result[0][0]),
                                                    int(current_result[0][1]), int(current_result[0][2]),
                                                    int(current_result[0][3]))
                    return result_time.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    return time_str

    @staticmethod
    async def grab_html2(session, url):
        with async_timeout.timeout(60):
            async with session.get(url, verify_ssl=False) as response:
                return await response.text()

    @staticmethod
    def grab_html2_now(session, headers, url, cookies):
        with session.get(url, headers=headers, cookies=cookies, verify=False) as response:
            return response.text

    async def post_grab2(self, session, url, data):
        with async_timeout.timeout(2 * 60):
            async with session.post(url=url, data=data, verify_ssl=False) as response:
                return await response.text()

    async def post_grab(self, url, data):
        cookies = await self.redis_cookie.fetch_cookies()
        LOGGER.info('using cookies' + str(cookies))
        async with aiohttp.ClientSession(cookies=cookies['cookies']) as session:
            return await self.post_grab2(session, url, data)

    async def grab_html(self, url):
        cookies = await self.redis_cookie.fetch_cookies()
        async with aiohttp.ClientSession(cookies=cookies['cookies']) as session:
            return await self.grab_html2(session, url)

    def grab_html_now(self, url):
        cookies = self.redis_cookie_now.fetch_cookies()
        headers = self.get_header()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Proxy-Connection'] = 'keep-alive'
        LOGGER.info('using cookies'+str(cookies))
        ok = True
        while ok:
            resp_text = requests.get(url=url, headers=headers, cookies=cookies['cookies'], verify=False).text
            userjson = json.loads(resp_text)
            # userjson = json.loads(resp_text,'GBK')
            if userjson['ok'] == 1:
                ok = False
        return userjson['data']

    @staticmethod
    def get_header():
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'weibo.com',
            # 'Referer': 'https://weibo.com',
            'User-Agent': user_agents.USER_AGENTS[random.randint(0, len(user_agents.USER_AGENTS) - 1)]
        }
        return header

    @staticmethod
    def find_fm_view_json(html):
        resp_html = BeautifulSoup(html, 'html.parser')
        scripts = resp_html.find_all('script')
        scripts.reverse()
        fm_view_pattern = re.compile(r'FM\.view\((.*)\)')
        view_jsons = []
        for script in scripts:
            print(script)
            r = fm_view_pattern.findall(str(script))
            if len(r):
                view_jsons.append(json.loads(r[0]))
        return view_jsons

    async def user_id_in_queue(self, user_id):
        if user_id and user_id not in self.bloom_filter:
            # LOGGER.info('%s in user queue.' % user_id)
            self.bloom_filter.add(user_id)
            await self.redis_job.push_job(JobType.user.value, {'user_id': user_id})

    async def get_user_id_from_homepage(self, home_page):
        html_content = await self.grab_html(home_page)
        home_page_html = BeautifulSoup(html_content, "lxml")
        info_a = home_page_html.find('a', string='资料')
        # LOGGER.info('get id from home page: %s' % home_page)
        if info_a:
            user_id = info_a.get('href').split('/')[1]
            # LOGGER.info('id got: %s' % user_id)
            return user_id
        return 0

    async def parse_tweet_content(self, html, job_info):
        tweet_div = html.find(id='M_', class_='c')
        if tweet_div:
            tweet_user_a = tweet_div.find('a')
            flag = False
            if tweet_user_a:
                tweet = {}
                tweet_user_href = tweet_user_a.get('href')
                if tweet_user_href.startswith('/u/'):
                    tweet_user_id = tweet_user_href[3:]
                else:
                    tweet_user_id = await self.get_user_id_from_homepage(self.weibo_host + tweet_user_href)
                await self.user_id_in_queue(tweet_user_id)
                if tweet_div.find(class_='cmt', string='转发理由:'):
                    tweet['flag'] = '转发'
                    parent = tweet_div.find(class_='cmt', string='转发理由:').parent
                    try:
                        comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href')

                        href = comment_href.split('?')[0]
                        tweet['sourceTid'] = href.split('/')[-1]

                    except Exception:
                        pass
                    text = parent.get_text()
                    fields = text.split('\xa0')
                    flag = True
                    content = fields[0][5:]
                    tweet['content'] = content.strip()
                    # ct_content = parent.find('span', class_='ct').get_text()
                    # time_source = ct_content.split('\u6765\u81ea')
                    #
                    # time = time_source[0]
                    # if len(time_source) == 2:
                    #     source = time_source[1]
                    # else:
                    #     source = 'unknown'
                    # other = ';'.join(fields[1:])
                else:
                    tweet_content = tweet_div.find('span', class_='ctt').get_text()
                    tweet['content'] = tweet_content.strip()
                tweet_details = list(
                    filter(lambda div: div.find(class_='pms'),
                           html.find_all('div', id=False, class_=False)))
                tweet['sourceTid'] = job_info['parentTid'] if 'parentTid' in job_info \
                    else tweet.get('sourceTid', '') if flag else ''
                detail = tweet_details[0].get_text(';').replace('\xa0', '')
                like = re.findall(u'\u8d5e\[(\d+)\];', detail)  # like count
                transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', detail)  # repost count
                comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', detail)  # comment count
                tweet['id'] = job_info['tweetId']
                tweet['like'] = like[0] if like else 0
                tweet['transfer'] = transfer[0] if transfer else 0
                tweet['comment'] = comment[0] if comment else 0
                tweet['type'] = 'tweet_info'
                # if flag:
                #     await self.weibo_producer.send(tweet, job_info['url'])
                # else:
                others = tweet_div.find(class_='ct').get_text()
                if others:
                    others = others.split('\u6765\u81ea')
                    tweet['time'] = self.get_time(others[0])
                    if len(others) == 2:
                        tweet['source'] = others[1]
                tweet['uid'] = tweet_user_id
                await self.weibo_producer.send(tweet, job_info['url'])
                return tweet
        return None

    async def grab_tweet_repost(self, repost_job_info):
        LOGGER.info('start grab tweet repost: %s' % str(repost_job_info))

        html_content = await self.grab_html(repost_job_info['url'])
        tweet_repost_html = BeautifulSoup(html_content, "lxml")
        repost_divs = tweet_repost_html.find_all(class_='c')
        for div in repost_divs:
            span_cc = div.find('span', class_='cc')
            if span_cc:
                attitube_a = span_cc.find('a')
                if attitube_a:
                    href = attitube_a.get('href')
                    if len(href.split('/')) > 2:
                        await self.redis_job.push_job(JobType.comment.value,
                                                      {'url': self.tweet_comment_url % href.split('/')[2],
                                                       'tweetId': href.split('/')[2],
                                                       'parentTid': repost_job_info['tweetId']})
                        await self.redis_job.push_job(JobType.repost.value,
                                                      {'url': self.user_repost_url % href.split('/')[2],
                                                       'tweetId': href.split('/')[2],
                                                       'parentTid': repost_job_info['tweetId']})
        if 'page=' not in repost_job_info['url']:
            await self.parse_tweet_content(tweet_repost_html, repost_job_info)
            page_div = tweet_repost_html.find(id='pagelist')
            if page_div:

                max_page = int(page_div.input.get('value'))
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(JobType.repost.value,
                                                  {'url': self.user_repost_url2 % (repost_job_info['tweetId'], page),
                                                   'tweetId': repost_job_info['tweetId']})
        pass

    async def grab_tweet_comments(self, comment_job):
        LOGGER.info('start grab comment: %s' % str(comment_job))
        html_content = await self.grab_html(comment_job['url'])
        comment_html = BeautifulSoup(html_content, "lxml")

        comment_divs = comment_html.find_all(id=re.compile(r'C_\d'), class_='c')
        for comment_div in comment_divs:
            comment_info = {}
            comment_id = comment_div.get('id')
            user_a = comment_div.find('a')
            if user_a:
                user_href = user_a.get('href')
                if user_href.startswith('/u/'):
                    user_id = user_href[3:]
                else:
                    user_id = await self.get_user_id_from_homepage(self.weibo_host + user_href)
                await self.user_id_in_queue(user_id)
                comment_info['userId'] = user_id
                comment_info['content'] = comment_div.find(class_='ctt').get_text()
                others = comment_div.find(class_='ct').get_text()
                if others:
                    others = others.split('\u6765\u81ea')
                    comment_info['pubTime'] = self.get_time(others[0])
                    if len(others) == 2:
                        comment_info['source'] = others[1]
                comment_info['id'] = comment_id
                comment_info['tweetId'] = comment_job['tweetId']
                comment_info['type'] = 'comment_info'
                await self.weibo_producer.send(comment_info, comment_job['url'])

        if 'page=' not in comment_job['url']:
            await self.parse_tweet_content(comment_html, comment_job)
            page_div = comment_html.find(id='pagelist')
            if page_div:

                max_page = int(page_div.input.get('value'))
                for page in range(2, max_page + 1):
                    await self.redis_job.push_job(JobType.comment.value,
                                                  {'url': self.tweet_comment_url2 % (comment_job['tweetId'], page),
                                                   'tweetId': comment_job['tweetId']})

    async def topic_finding(self):
        while True:
            topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value)
            if topic_job_info:
                try:
                    print(topic_job_info)
                    LOGGER.info('topic finding')
                    await self.search_topic_user(topic_job_info)
                except TimeoutError as e:
                    LOGGER.info('topic finding timeout error')
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    async def super_fan_finding(self):
        while True:
            topic_job_info = self.redis_job_now.fetch_job(JobType.superfan.value)
            if topic_job_info:
                try:
                    print(topic_job_info)
                    LOGGER.info('super fan finding')
                    await self.search_super_fan(topic_job_info)
                except TimeoutError as e:
                    LOGGER.info('super fan finding timeout error')
                    pass
                except:
                    LOGGER.error(traceback.format_exc())
                    sleep(5 * 60)

    def push_topic_job_now(self):
        self.redis_job_now.push_job(
            JobType.topic.value, {'url': 'https://m.weibo.cn/api/container/getIndex?containerid=1008084d899324c66df69e0248e385a7eccca2_-_feed&page=1'})

    async def push_topic_job(self):
        for page in range(1, self.topic_max_page):
            await self.redis_job.push_job(JobType.topic.value, {
                'url': self.topic_url % page})

    def push_topic_job_all_now(self):
        for page in range(1, self.topic_max_page+1):
            self.redis_job_now.push_job(JobType.topic.value, {
                'url': self.topic_url % page})


    def push_super_new_fan_job_all_now(self):
        for page in range(1, self.super_new_fan_max_page+1):
            self.redis_job_now.push_job(JobType.superfan.value, {
                'url': self.super_new_fan_url % page})

    def topic_finding_now(self):
        topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value)
        if topic_job_info:
            try:
                print(topic_job_info)
                LOGGER.info('topic finding')
                self.search_topic_user_now(topic_job_info)
            except TimeoutError as e:
                LOGGER.info('topic finding timeout error')
                pass
            except:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)

    def search_topic_user_now(self, topic_job_info):
        LOGGER.info('try to grab html: ' + topic_job_info['url'])
        userjson = self.grab_html_now(topic_job_info['url'])
        LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
        for group in userjson['cards']:
            if 'show_type' in group:
                for card in group['card_group']:
                    user_id = card['mblog']['user']['id']
                    self.redis_job_now.push_job(JobType.user.value, {'user_id': user_id, 'source': 'comment'})

    async def search_topic_user(self, topic_job_info):
        LOGGER.info('try to grab html: ' + topic_job_info['url'])
        html_content = await self.grab_html(topic_job_info['url'])
        userjson = json.loads(html_content)
        userjson = userjson['data']
        LOGGER.info('succeed to grab html: ' + topic_job_info['url'])
        for group in userjson['cards']:
            if 'show_type' in group:
                for card in group['card_group']:
                    user_id = card['mblog']['user']['id']
                    await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'comment'})

    async def search_super_fan(self, super_fan_job_info):
        LOGGER.info('try to grab html: ' + super_fan_job_info['url'])
        html_content = await self.grab_html(super_fan_job_info['url'])
        userjson = json.loads(html_content)
        userjson = userjson['data']
        LOGGER.info('succeed to grab html: ' + super_fan_job_info['url'])
        for group in userjson['cards']:
            for card in group['card_group']:
                user_id = card['user']['id']
                await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'super'})


    def start(self, args):
        LOGGER.info(str(args))
        workers = []
        if 'f' in args:  # follows
            workers += [asyncio.Task(self.crawl_follow(), loop=self.loop) for _ in range(self.tasks)]
        if 'o' in args:  # fans
            workers += [asyncio.Task(self.crawl_fan(), loop=self.loop) for _ in range(self.tasks)]
        if 'c' in args:  # comments
            workers += [asyncio.Task(self.crawl_comment(), loop=self.loop) for _ in range(self.tasks)]
        if 'u' in args:  # users
            workers += [asyncio.Task(self.crawl_user(), loop=self.loop) for _ in range(self.tasks)]
        if 'w' in args:  # tweet content
            workers += [asyncio.Task(self.crawl_weibo(), loop=self.loop) for _ in range(self.tasks)]
        if 'r' in args:  # reposts
            workers += [asyncio.Task(self.crawl_repost(), loop=self.loop) for _ in range(self.tasks)]
        if 's' in args:  # search
            workers += [asyncio.Task(self.search(), loop=self.loop) for _ in range(self.tasks)]
        if 't' in args:  # topic posts
            workers += [asyncio.Task(self.topic_finding(), loop=self.loop) for _ in range(self.tasks)]
            for _ in range(10):
                self.topic_finding_now()
        if 'a' in args:  # hall of fame (super fans)
            workers += [asyncio.Task(self.super_fan_finding(), loop=self.loop) for _ in range(self.tasks)]
        if 'i' in args:
            self.push_topic_job_all_now()
            sleep(5)
        if 'n' in args:
            self.push_super_new_fan_job_all_now()
            sleep(5)
        if workers:
            self.loop.run_until_complete(asyncio.wait(workers))
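# A hedged, standalone sketch of the relative-time parsing that get_time() above relies
# on. The three regexes are assumptions standing in for the class attributes
# time_current_pattern, time_today_pattern and time_year_pattern (defined elsewhere in
# the project); they only illustrate the "N分钟前 / 今天 HH:MM / MM月DD日 HH:MM" cases.
import datetime
import re

TIME_CURRENT = re.compile(r'(\d+)分钟前')                              # e.g. "5分钟前"
TIME_TODAY = re.compile(r'今天\s*(\d{1,2}):(\d{2})')                   # e.g. "今天 12:30"
TIME_YEAR = re.compile(r'(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})')  # e.g. "03月08日 09:15"


def parse_weibo_time(time_str):
    now = datetime.datetime.now()
    m = TIME_CURRENT.findall(time_str)
    if m:  # "N minutes ago"
        return (now - datetime.timedelta(minutes=int(m[0]))).strftime('%Y-%m-%d %H:%M:%S')
    m = TIME_TODAY.findall(time_str)
    if m:  # "today HH:MM"
        return datetime.datetime(now.year, now.month, now.day,
                                 int(m[0][0]), int(m[0][1])).strftime('%Y-%m-%d %H:%M:%S')
    m = TIME_YEAR.findall(time_str)
    if m:  # "MM月DD日 HH:MM" within the current year
        month, day, hour, minute = (int(x) for x in m[0])
        return datetime.datetime(now.year, month, day, hour, minute).strftime('%Y-%m-%d %H:%M:%S')
    return time_str  # anything else (absolute dates) is passed through unchanged

# parse_weibo_time('5分钟前')  -> a timestamp five minutes in the past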
Exemplo n.º 23
0
 data_dic = dict()
 content = row["content"]
 if tab["weidu"] in content:
     count = 0
     for zmxr in tab["zmxr"]:
         if zmxr in content:
             count += 1
             data_dic["weidu"] = tab["weidu"]
             data_dic["sword"] = zmxr
             data_dic["taidu"] = 'good'
             data_dic["variable"] = row["variable"]
             data_dic["row_names"] = row["row_names"]
             data_dic["value"] = row["value"]
             status = row["row_names"] + '_' + tab["weidu"] + '_' + zmxr
             i = md5(status.encode("utf8")).hexdigest()
             returndf = bf.add(i)
             if not returndf:
                 items = list()
                 items.append(data_dic)
                 save_df = pd.DataFrame(items)
                 save_df.to_sql(name='content_luntan_test1',
                                con=conn,
                                if_exists="append",
                                index=False)
                 print("-" * 50 + "insert data" + "-" * 50)
             else:
                 print("重复数据!")
     if count == 0:
         for z in tab["zxxr"]:
             if z in content:
                 count += 1
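# A minimal sketch of the md5 + bloom-filter dedup used in the fragment above: the
# status string is hashed, and ScalableBloomFilter.add() returns True when the key was
# (probably) already present, so a True result means "duplicate, skip it". Names here
# are illustrative, not the project's own.
from hashlib import md5
from pybloom_live import ScalableBloomFilter

bf_demo = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)


def is_duplicate(row_name, weidu, sword):
    status = row_name + '_' + weidu + '_' + sword
    key = md5(status.encode("utf8")).hexdigest()
    return bf_demo.add(key)  # True -> seen before, False -> first occurrence

# is_duplicate('row1', 'service', 'good')  -> False on the first call, True afterwards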
Exemplo n.º 24
0
xhr_headers = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Host': 'api.zhihu.com',
    'Access-Control-Request-Method': 'GET',
    'Connection': 'keep-alive'
}

# Load the ids of existing lives into id_list and the bloom filter
b = ScalableBloomFilter(10000, 0.001)
id_list = []
with open('live_id.txt', 'r+') as f:
    for line in f.readlines():
        id = line.strip()
        id_list.append(id)
        b.add(id)

# Load the ids of existing people into the bloom filter
bb = ScalableBloomFilter(10000, 0.001)
with open('peoples.txt', 'r+') as f:
    for line in f.readlines():
        id = line.strip()
        if id not in bb:
            bb.add(id)

# # Find all of the newest lives, dedupe them, and store them in id_list, the filter and the file

tag_list = [
    101, 102, 103, 104, 105, 106, 107, 108, 109, 201, 202, 203, 301, 302, 303,
    304, 305
]
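# A hedged sketch of how the filters above would typically be used during the crawl:
# a newly discovered live id is checked against the filter, and only unseen ids are
# appended to id_list and to live_id.txt. new_live_ids is a placeholder for whatever
# the API call actually returns.
def record_new_lives(new_live_ids, bloom, known_ids, path='live_id.txt'):
    with open(path, 'a') as out:
        for live_id in new_live_ids:
            if live_id in bloom:
                continue  # already crawled
            bloom.add(live_id)
            known_ids.append(live_id)
            out.write(live_id + '\n')

# record_new_lives(['123456'], b, id_list)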
Exemplo n.º 25
0
class ChexiuPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mongo
        self.connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = self.connection[settings['MONGODB_DB']]
        website = settings["WEBSITE"]
        # local_time = time.strftime('%Y-%m-%d', time.localtime())
        # if website in ["pcauto_price", "yiche_price", "autohome_price"]:
        #     self.collection = db[settings['MONGODB_COLLECTION'] + '_' + str(local_time)]
        # else:
        self.collection = db[settings['MONGODB_COLLECTION']]
        # bloom file
        self.CrawlCar_Num = 1000000
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB'] + '/' + settings[
            'MONGODB_COLLECTION'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB']
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
        self.counts = 0

    def process_item(self, item, spider):
        if spider.name in ["chexiuSpider", "chexiu_car"]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.writelines(i + '\n')
                self.collection.insert(dict(item))
                logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
                self.counts += 1
                logging.log(msg="scrapy                    " + str(self.counts) + "                  items",
                            level=logging.INFO)
                # return item
        elif spider.name in ["autohome_price_new", "yiche_price", "pcauto_price", "58car_price"]:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) + "                  items",
                        level=logging.INFO)
        else:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) + "                  items",
                        level=logging.INFO)

    def close_spider(self, spider):
        self.connection.close()

        # self.fa.close()

    def dingmessage(self):
        # The request URL (the WebHook address)
        webhook = "https://oapi.dingtalk.com/robot/send?access_token=633758ccd22b7db4d2e9655488af7d3f5d5e0b2a32c701c80fc3cd57981e73a9"
        # Build the request headers
        header = {
            "Content-Type": "application/json",
            "Charset": "UTF-8"
        }
        # Build the request payload
        tex = "-车秀网爬虫结束-"
        message = {

            "msgtype": "text",
            "text": {
                "content": tex
            },
            "at": {

                "isAtAll": False
            }

        }
        # JSON-encode the request payload
        message_json = json.dumps(message)
        # Send the request
        info = requests.post(url=webhook, data=message_json, headers=header)
        # Print the response
        print(info.text)
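# A compact sketch of the persistence scheme the pipeline above uses: hashes of seen
# items live in a .blm text file, are replayed into a ScalableBloomFilter when the
# spider starts, and each new hash is appended so the next run starts deduplicated.
# The helper name and signature are illustrative, not part of the project.
import pathlib
from pybloom_live import ScalableBloomFilter


def load_seen_filter(blm_path, capacity=1000000, error_rate=0.01):
    bloom = ScalableBloomFilter(initial_capacity=capacity, error_rate=error_rate)
    path = pathlib.Path(blm_path)
    path.parent.mkdir(parents=True, exist_ok=True)  # same effect as the os.makedirs branch above
    path.touch(exist_ok=True)
    with open(path, "r") as fr:
        for line in fr:
            bloom.add(line.strip('\n'))
    # the caller keeps an append handle and writes every new hash followed by '\n'
    return bloom, open(path, "a")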
Exemplo n.º 26
0
class CarbuisnessNewPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mongo
        self.connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = self.connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
        # bloom file
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']

        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
        # # read
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
        self.counts = 0

    def process_item(self, item, spider):
        if spider.name in ["autohome_error_new", "jzg_price_master", ]:
            # if item["newcar_bug_num"] is not None or item["oldcar_bug_num"] is not None or item["oldcar_bug_ratio"] is not None or item["newcar_bug_ratio"] is not None:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) + "                  items",
                        level=logging.INFO)
        else:
            if spider.name in ['all_location', 'jzg_price', 'jzg_price_sh', 'xiaozhu_modellist', 'xiaozhu_gz',
                               'autohome_gz', 'jzg_modellist']:
                # print("*"*100)
                valid = True
                i = md5(item['status'].encode("utf8")).hexdigest()
                returndf = self.df.add(i)
                if returndf:
                    valid = False
                    raise DropItem("Drop data {0}!".format(item["status"]))
                else:
                    pass
                if valid:
                    self.fa.writelines(i + '\n')
                    self.collection.insert(dict(item))
                    logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
                    self.counts += 1
                    logging.log(msg="scrapy                    " + str(self.counts) + "                  items",
                                level=logging.INFO)
            if spider.name in ['autohome_url']:
                pass

    def close_spider(self, spider):
        self.connection.close()
        self.fa.close()
Exemplo n.º 27
0
class Agent(Zentropian):
    def __init__(self, name=None):
        self.timers = TimerRegistry(callback=self._trigger_frame_handler)
        super().__init__(name=name)
        self.states.should_stop = False
        self.states.running = False
        self.loop = None  # asyncio.get_event_loop()
        self._spawn_on_start = set()
        self._seen_frames = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH, error_rate=0.001)

    @on_state('should_stop')
    def _on_should_stop(self, state):
        if state.data.last is False and state.data.value is True:  # skip double close
            self.close()
        return True

    async def _run_forever(self):
        # atexit.register(self.loop.close)
        if self._spawn_on_start:
            for coro in self._spawn_on_start:
                self.spawn(coro)
            self._spawn_on_start = None
        self.emit('*** started', internal=True)
        self.timers.start_timers(self.spawn)
        while self.states.should_stop is False:
            await asyncio.sleep(1)
        self.emit('*** stopped', internal=True)

    def _set_asyncio_loop(self, loop=None):
        if self.loop and loop:
            raise AssertionError('Agent already has an event loop set.')
        if loop:
            self.loop = loop
        if not self.loop:
            try:
                self.loop = asyncio.get_event_loop()
            except RuntimeError:
                self.loop = asyncio.new_event_loop()

    def _trigger_frame_handler(self, frame: Frame, handler: Handler, internal=False):
        if isinstance(frame, Message) and frame.source == self.name:
            return
        if isinstance(frame, Event) and frame.source != self.name and frame.name.startswith('***'):
            return
        if frame and frame.id in self._seen_frames:
            return
        if not self.apply_filters([handler]):
            return
        if frame:
            self._seen_frames.add(frame.id)
        payload = []  # type: list
        if handler.pass_self:
            payload.append(self)
        if handler.kind != KINDS.TIMER:
            payload.append(frame)
        if handler.run_async:
            async def return_handler():
                ret_val = await handler(*payload)
                if ret_val:
                    self.handle_return(frame, return_value=ret_val)

            self.spawn(return_handler())
        else:
            ret_val = handler(*payload)
            if ret_val:
                return self.handle_return(frame, return_value=ret_val)

    def add_handler(self, handler):
        if handler.kind == KINDS.TIMER:
            self.timers.add_handler(handler.name, handler)
        else:
            super().add_handler(handler)

    def on_timer(self, interval):
        def wrapper(handler):
            name = str(interval)
            handler_obj = Handler(kind=KINDS.TIMER, name=name, handler=handler)
            self.timers.add_handler(name, handler_obj)
            return handler

        return wrapper

    @staticmethod
    def sleep(duration: float):
        return asyncio.sleep(duration)

    def start(self, loop=None):
        self._set_asyncio_loop(loop)
        self.loop.create_task(self._run_forever())

    def run(self):
        self._set_asyncio_loop()
        self.loop.run_until_complete(self._run_forever())

    def spawn(self, coro):
        if not self.loop:
            self._spawn_on_start.add(coro)
            return
        return self.loop.create_task(coro)

    @staticmethod
    def spawn_in_thread(func, *args, **kwargs):
        task = threading.Thread(target=func, args=args, kwargs=kwargs)
        task.start()
        return task

    def run_in_thread(self):
        return self.spawn_in_thread(self.run)

    def stop(self):
        self.emit('*** stopping', internal=True)
        self.states.should_stop = True
        self.timers.should_stop = True

    def connect(self, endpoint, *, auth=None, tag='default'):
        retval = super().connect(endpoint, auth=auth, tag=tag)
        if not isgeneratorfunction(retval):
            return
        self.spawn(retval)

    def bind(self, endpoint, *, tag='default'):
        retval = super().bind(endpoint, tag=tag)
        if not isgeneratorfunction(retval):
            return
        self.spawn(retval)

    def join(self, space, *, tags: Optional[Union[list, str]] = None):
        retval = super().join(space, tags=tags)
        if not isgeneratorfunction(retval):
            return
        self.spawn(retval)

    def leave(self, space, *, tags: Optional[Union[list, str]] = None):
        retval = super().leave(space, tags=tags)
        if not isgeneratorfunction(retval):
            return
        self.spawn(retval)

    def close(self, *, endpoint: Optional[str] = None, tags: Optional[Union[list, str]] = None):
        """Closes all connections if no endpoint or tags given."""
        if endpoint and tags:
            raise ValueError('Expected either endpoint: {!r} or tags: {!r}.'
                             ''.format(endpoint, tags))
        elif endpoint:
            connections = self._connections.connections_by_endpoint(endpoint)
        elif tags:
            connections = self._connections.connections_by_tags(tags)
        else:
            connections = self._connections.connections
        for connection in connections:
            connection.close()
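# A hedged sketch of the spawn() deferral used by Agent above: coroutines scheduled
# before the agent has an event loop are parked in a set and only turned into tasks
# once start()/run() binds a loop. The class below is a minimal reproduction of that
# pattern, not part of the original code.
class DeferredSpawner:
    def __init__(self):
        self.loop = None
        self._spawn_on_start = set()

    def spawn(self, coro):
        if not self.loop:
            self._spawn_on_start.add(coro)  # no loop yet: remember the coroutine
            return None
        return self.loop.create_task(coro)

    def start(self, loop):  # loop: an asyncio event loop
        self.loop = loop
        for coro in self._spawn_on_start:
            self.loop.create_task(coro)
        self._spawn_on_start = None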
Exemplo n.º 28
0
def antcolony_userV4(token,
                     domain,
                     bloom: ScalableBloomFilter,
                     key: str,
                     dataobject,
                     keylist=None,
                     lamda=20,
                     xhr_headers=XHR_HEADER_WZ):
    def keyconvert(keys, d):
        c = {}
        for k in d.keys():
            if k in keys:
                c[k] = d[k]
        return c

    urlhead = create_userV4(domain, token)
    req = getRequest()
    people_url = urlhead + paging(0, 5)
    print(people_url)
    print('before r')
    r = req.get(people_url, headers=xhr_headers)
    print('after r')
    if int(r.status_code) == 410:
        dataobject.delone(token)
        return 0
    if int(r.status_code) > 300:
        raise WrongStatuCode(str(r.status_code) + ': ' + people_url)
    j = json.loads(r.text)
    if 'error' in j:
        raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n  msg=' +
                          j['error'])
    print(j['paging'])
    try:
        total = int(j['paging']['totals'])
    except:
        total = None
    if total:
        print(total)
        print(round(total / lamda))
        for i in range(round(total / lamda) + 1):
            urll = urlhead + paging(i * lamda, lamda)
            r = req.get(urll, headers=xhr_headers)
            # print(r)
            jc = json.loads(r.text)
            data = jc['data']

            if keylist:
                data = [keyconvert(keylist, x) for x in data]
            for i in data:
                if i[key] not in bloom:
                    dataobject.insert(i)
                    bloom.add(i[key])
                    print('data import ' + i[key])
                else:
                    print('pass')
    else:
        i = 0
        while True:
            urll = urlhead + paging(int(i * lamda), lamda)
            r = req.get(urll, headers=xhr_headers)
            jc = json.loads(r.text)
            data = jc['data']
            if len(data) == 0:
                break
            if keylist:
                data = [keyconvert(keylist, x) for x in data]
            for c in data:
                if c[key] not in bloom:
                    dataobject.insert(c)
                    bloom.add(c[key])
                    print('data import ' + c[key])
                else:
                    print('pass')
            i += 1
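# A small sketch of the paging walk antcolony_userV4 performs when the API reports a
# total: round(total / lamda) + 1 requests, each asking for `lamda` records starting
# at offset i * lamda. paging_demo() below is only an assumed stand-in for the
# project's own paging() helper and renders the offset/limit fragment this code
# appears to expect.
def paging_offsets(total, lamda=20):
    return [i * lamda for i in range(round(total / lamda) + 1)]


def paging_demo(offset, limit):
    # assumed shape of the real paging() helper
    return '?offset=%d&limit=%d' % (offset, limit)

# paging_offsets(45, 20) -> [0, 20, 40]; each offset is fed to urlhead + paging(offset, lamda)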
Exemplo n.º 29
0
class TaochePipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mongo
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        db = self.connection[settings['MONGODB_DB']]
        website = settings["WEBSITE"]
        # local_time = time.strftime('%Y-%m-%d', time.localtime())
        # if website in ["pcauto_price", "yiche_price", "autohome_price"]:
        #     self.collection = db[settings['MONGODB_COLLECTION'] + '_' + str(local_time)]
        # else:
        self.collection = db[settings['MONGODB_COLLECTION']]
        # bloom file
        self.CrawlCar_Num = 1000000
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB']
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")

        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
        self.counts = 0

    def process_item(self, item, spider):
        if spider.name in ["taoche_car", "taoche_gz"]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.writelines(i + '\n')
                self.collection.insert(dict(item))
                logging.log(msg="Car added to MongoDB database!",
                            level=logging.INFO)
                self.counts += 1
                logging.log(msg="scrapy                    " +
                            str(self.counts) + "                  items",
                            level=logging.INFO)
                # return item
        elif spider.name in [
                "autohome_price_new", "yiche_price", "pcauto_price",
                "58car_price"
        ]:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!",
                        level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) +
                        "                  items",
                        level=logging.INFO)
        else:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!",
                        level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy                    " + str(self.counts) +
                        "                  items",
                        level=logging.INFO)

    def close_spider(self, spider):
        self.connection.close()

        self.fa.close()
Exemplo n.º 30
0
    # Initialise a crawling dataset connection
    print(colored('Initialising wikipedia crawling collection...', 'cyan'))
    crawl_collection = init_crawl_collection()

    # Iterate through the crawling database
    n = 0
    print(colored('Iterating over crawling database...', 'cyan'))
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for topic, sentence in iter_topic(crawl_collection, args['start']):

        # Clean topic string
        topic = topic.replace("'", '').replace('\n', '')

        # Check whether the number of processed topics exceeds the limit
        if topic not in bf:
            bf.add(topic)
            if len(bf) > args['limit']:
                print(colored('[Topics limit reached] ... BYE', 'cyan'))
                sys.exit(0)

        # Break the sentence into knowledge nodes
        pos = TextStructure.pos_tag(sentence)
        kb_nodes = patterns.capture(pos)

        # Clean up each of the nodes
        # a) Remove stopwords
        # b) Remove duplicates
        # c) Ensure supported encoding
        kb_nodes = ensure_viable(kb_nodes, stopwords)

        if args['verbose']:
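# In code form, the stop condition the loop above relies on: ScalableBloomFilter keeps
# a running count of added elements, so len(bf) doubles as an approximate number of
# distinct topics seen so far. A hedged, standalone restatement:
from pybloom_live import ScalableBloomFilter


def reached_topic_limit(bf, topic, limit):
    if topic not in bf:
        bf.add(topic)
    return len(bf) > limit

# bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
# reached_topic_limit(bf, 'Python', 10000)  -> False until more than 10000 distinct topics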
Exemplo n.º 31
0
class YongdaPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Number of idle time slices to wait, read from the settings (default 6)
        idle_number = crawler.settings.getint('IDLE_NUMBER', 6)
        # Instantiate the extension object
        ext = cls(crawler.settings, idle_number, crawler)
        # Connect the extension to the signal, linking signals.spider_idle to the spider_idle() method
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def __init__(self, settings, idle_number, crawler):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )

        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # db = self.connection[settings['MONGODB_DB']]
        # self.collection = db[settings['MONGODB_COLLECTION']]

        # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings

        # redis idle signal
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings[
            'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']

        # scrapy date
        self.start_date = None
        self.end_date = None
        self.scrapy_date = f'{self.start_date}  -   {self.end_date}'

        # dataframe
        self.df_result = pd.DataFrame()

        # bloom filter
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num,
                                      error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)

        # Make sure the bloom snapshot directory and file exist, then open for appending
        os.makedirs(dirname, exist_ok=True)
        pathlib.Path(filename).touch(exist_ok=True)
        self.fa = open(filename, "a")

        # Preload hashes persisted by earlier runs into the bloom filter
        with open(filename, "r") as fr:
            for line in fr:
                self.df.add(line.strip('\n'))

    def open_spider(self, spider):
        self.start_date = time.strftime('%Y-%m-%d %X', time.localtime())

    def process_item(self, item, spider):
        if spider.name == '':
            i = md5(item[''].encode("utf8")).hexdigest()
            # ScalableBloomFilter.add() returns True when the hash is already present
            if self.df.add(i):
                raise DropItem("Drop data {0}!".format(item["statusplus"]))
            # Persist the new hash so the filter can be rebuilt on the next run
            self.fa.writelines(i + '\n')
            # Store the item into mysql
            df = pd.DataFrame([item])
            df.to_sql(name=self.settings['MYSQL_TABLE'],
                      con=self.conn,
                      if_exists="append",
                      index=False)
            logging.log(
                msg=f"scrapy              {self.mongocounts}              items",
                level=logging.INFO)
            self.mongocounts += 1
            # if spider.name == '':
            # self.df_result = pd.concat([self.df_result, df])
            # self.mongocounts += 1
            # logging.log(msg=f"add              {self.mongocounts}              items", level=logging.INFO)
            # else:
            # df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False)
            # self.mongocounts += 1
            # logging.log(msg=f"scrapy              {self.mongocounts}              items", level=logging.INFO)
        return item

    def close_spider(self, spider):
        # self.connection.close()
        # if spider.name == '':
        # self.df_result.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False)
        self.conn.dispose()
        self.end_date = time.strftime('%Y-%m-%d %X', time.localtime())
        self.scrapy_date = f'{self.start_date}  -   {self.end_date}'
        logging.info(self.scrapy_date)

    def spider_idle(self, spider):
        self.idle_count += 1  # idle counter
        self.idle_list.append(time.time())  # record a timestamp every time spider_idle fires
        idle_list_len = len(self.idle_list)  # number of consecutive idle triggers so far
        print(self.scrapy_date)
        logging.info(self.scrapy_date)
        # print(idle_list_len)
        # print(self.idle_count)
        # print(self.idle_list[-1] - self.idle_list[-2])
        # If the gap between this trigger and the previous one is larger than ~5 seconds,
        # redis still has keys to crawl, so reset the idle bookkeeping.
        if idle_list_len > 2 and not (1 <
                                      (self.idle_list[-1] - self.idle_list[-2])
                                      < 6):
            self.idle_list = [self.idle_list[-1]]
            self.idle_count = 1

        elif idle_list_len == self.idle_number + 1:
            # After the configured number of consecutive idle signals, record the end time
            self.end_date = time.strftime('%Y-%m-%d %X', time.localtime())
            self.scrapy_date = f'{self.start_date}  -   {self.end_date}'
            self.start_date = time.strftime('%Y-%m-%d %X', time.localtime())
            print(self.scrapy_date)
            print("*" * 100)

        elif idle_list_len > self.idle_number + 12:
            # After continuing to idle well past the threshold, reset the start time
            self.start_date = time.strftime('%Y-%m-%d %X', time.localtime())
            self.idle_count = 0
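
# For reference, a condensed standalone sketch of the persistence idea the pipeline above
# relies on: hash each record key, let ScalableBloomFilter.add() report duplicates, and append
# new hashes to a .blm file so the filter can be rebuilt on the next run. The file path and
# record keys below are illustrative assumptions, not values from the original project.
import os
from hashlib import md5

from pybloom_live import ScalableBloomFilter

BLM_FILE = 'blm/spider/example_table.blm'   # assumed path layout
os.makedirs(os.path.dirname(BLM_FILE), exist_ok=True)

# Rebuild the filter from hashes persisted by earlier runs
seen = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.01)
if os.path.exists(BLM_FILE):
    with open(BLM_FILE) as fr:
        for line in fr:
            seen.add(line.strip('\n'))

def is_duplicate(record_key, fa):
    # add() returns True when the hash already exists in the filter
    h = md5(record_key.encode('utf8')).hexdigest()
    if seen.add(h):
        return True
    fa.write(h + '\n')      # persist the new hash for the next run
    return False

with open(BLM_FILE, 'a') as fa:
    print(is_duplicate('record-001', fa))   # False the first time
    print(is_duplicate('record-001', fa))   # True on the repeat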