Пример #1
0
    def push_random_kols_to_queue(self, base_path='..', number=300):
        '''
        Publish randomly chosen kol ids to the 'facebook_scanning' queue.

        Reads <base_path>/backend/input/kols_list.txt (one id per line, blank
        lines ignored), picks distinct ids at random and publishes each id as
        a message body with an empty exchange name and the queue name as
        routing key.

        input
        -----
        base_path: folder that contains the 'backend' directory
        number: maximum number of distinct kol ids to push. When the file
            holds fewer distinct ids than requested, all of them are pushed
            (the selection can never loop forever or fail on an empty file).
        '''
        # Local import: only `randint` is known to be imported at module level.
        import random

        print(
            "Push %s random kols id to smcc service to get their post in the next crawl"
            % str(number))
        channel = self._connection.channel()
        queue_name = 'facebook_scanning'
        # Declare the queue as durable before publishing to it.
        channel.queue_declare(queue_name, durable=True)

        kols_path = get_independent_os_path(
            [base_path, 'backend', 'input', 'kols_list.txt'])
        with open_utf8_file_to_read(kols_path) as stream:
            kols_list = [
                line.strip() for line in stream.read().split('\n')
                if line.strip()
            ]

        # Drop duplicated ids (keeping file order) so the sample size can be
        # bounded by the number of ids that are really available.
        unique_kols = list(dict.fromkeys(kols_list))
        sample_size = max(0, min(number, len(unique_kols)))
        chosen = random.sample(unique_kols, sample_size)

        for kol_id in chosen:
            print(kol_id)
            channel.basic_publish(exchange='',
                                  routing_key=queue_name,
                                  body=kol_id)
Пример #2
0
    def load_data(self,
                  crawl_newspaper=True,
                  crawl_kols=False,
                  crawl_kols_by_smcc=False,
                  random_kols=True,
                  random_fb_account=True,
                  max_kols=5,
                  base_path=None):
        '''
        Load the YAML config file (self._filename) and rebuild the crawl list.

        input
        -----
        crawl_newspaper: keep the newspaper configs read from the config file.
            They are re-added after the kol configs so the kol configs are
            created first.
        crawl_kols: create one 'facebook user' webconfig per selected kol.
            Kols are read from self._kol_filename, one per line, either
            "<profile id>" or "<name>;<url>".
            random_kols: choose (max_kols) distinct kols at random; otherwise
                take the first (max_kols) kols of the list in file order
            random_fb_account: set a random browser profile, read from
                self._fb_account_filename, on each kol webconfig
            max_kols: maximum number of kols to select
        crawl_kols_by_smcc: add a single webconfig with
            crawl_type = "kols smcc". This method only creates the webconfig;
            pushing kol ids to the queue is done elsewhere.
        base_path: path to the folder that contains 'resources'. Defaults to
            the DOCBAO_BASE_DIR environment variable.

        raise
        -----
        KeyError: base_path is not given and DOCBAO_BASE_DIR is not set
        '''
        if not base_path:
            base_path = os.environ['DOCBAO_BASE_DIR']

        default_config_path = get_independent_os_path(
            [base_path, 'resources', 'configs', 'newspaper'])

        # NOTE(review): PyYAML recommends safe_load for untrusted input;
        # full_load is kept on the assumption that this file is trusted.
        # The `with` block closes the file even when parsing fails.
        with open_utf8_file_to_read(self._filename) as stream:
            self._config = yaml.full_load(stream)

        # Remember the newspapers before clearing the crawl list: they are
        # crawled last to init browser with random profiles first.
        newspaper_list = self.get_newspaper_list() if crawl_newspaper else []
        self.replace_crawl_list([])

        if crawl_kols:
            # get kols_list (stripped so '\r' or spaces never reach name/url)
            with open_utf8_file_to_read(self._kol_filename) as stream:
                kols_list = [
                    x.strip() for x in stream.read().split('\n') if x.strip()
                ]

            # get fb account list
            fb_list = []
            if random_fb_account:
                with open_utf8_file_to_read(
                        self._fb_account_filename) as stream:
                    fb_list = [
                        x.strip() for x in stream.read().split('\n')
                        if x.strip()
                    ]
                if not fb_list:
                    # Nothing to choose from: browser_profile is left as
                    # loaded by load_default_config instead of crashing.
                    print("No facebook account found, "
                          "browser_profile is not changed")

            # Never ask for more kols than available (or a negative number).
            kols_count = max(0, min(max_kols, len(kols_list)))
            if random_kols:
                # distinct random positions, no repeat value
                chosen_indexes = random.sample(range(len(kols_list)),
                                               kols_count)
            else:
                # sequential mode starts at the first kol of the list
                chosen_indexes = range(kols_count)

            for index in chosen_indexes:
                kol_entry = kols_list[index]
                if random_kols:
                    print(f"Choose random kols: {kol_entry}"
                          )  # print choosen kol for debugging

                if ';' not in kol_entry:
                    # this line contain just id, not name;url
                    kol_name = 'unknown_id_' + kol_entry
                    crawl_url = kol_entry  # profile id
                else:
                    parts = kol_entry.split(';')
                    kol_name = parts[0]
                    crawl_url = parts[1]

                webconfig = WebConfig()
                webconfig.load_default_config('facebook user',
                                              default_config_path)
                webconfig.set_webname(kol_name)
                webconfig.set_config('crawl_url', crawl_url)
                webconfig.set_config(
                    'remove_me', True)  # tag for delete when program finish
                # set random fb account to crawl (fb_list is empty when
                # random_fb_account is False)
                if fb_list:
                    webconfig.set_config('browser_profile',
                                         random.choice(fb_list))

                self.add_newspaper(webconfig)

        # crawl kols by smcc
        if crawl_kols_by_smcc:
            # create a 'crawl_type: kols smcc' WebConfig
            webconfig = WebConfig()
            webconfig.load_default_config('facebook user',
                                          default_config_path)
            webconfig.set_config('crawl_type', 'kols smcc')
            webconfig.set_config('remove_me', True)
            webconfig.set_config('timezone', 'UTC')
            webconfig.set_webname('kol posts')
            webconfig.set_config('minimum_duration_between_crawls', -5)

            self.add_newspaper(webconfig)

        # append newspaper list (empty when crawl_newspaper is False)
        for newspaper in newspaper_list:
            self.add_newspaper(newspaper, beginning=True)
Пример #3
0
 def load_config_from_file(self, filepath):
     '''
     Parse the YAML file at filepath and keep its first item in self._web.

     input
     -----
     filepath: path of the YAML file to read
     '''
     with open_utf8_file_to_read(filepath) as config_file:
         documents = yaml.full_load(config_file)
     # Only the first entry of the parsed content is used.
     self._web = documents[0]