def __init__(self, kvstore_host):
        """Wire up the resource manager and the metadata database for a host.

        Args:
            kvstore_host: host name or address; stored as ``KV_STORE_HOST``
                and interpolated into the MySQL connection URL (port 3306).
        """
        log.info("Initializing KeyValueStoreClient")

        self.rm = ResourceManager()
        self.KV_STORE_HOST = kvstore_host

        # NOTE(review): the database user and password are hard-coded in this
        # URL, and the password itself contains '@' — confirm that
        # DataBaseHandler parses it as intended and consider moving the
        # credentials into configuration.
        db_url = f'mysql+pymysql://root:P@ssword123@{kvstore_host}:3306/kvstore'
        self.KV_STORE_DB_PATH = db_url
        self.db = DataBaseHandler(db_url)

        log.info("Initializing KeyValueStoreClient is successful")
Exemplo n.º 2
0
    def test_collects_only_requested_number_of_pages_of_friends(self):
        """Run ``start.py -n 1 -t -p 1`` and check how many friends were stored.

        Uses "seed_with_lots_of_friends.csv" as seed file and the MySQL test
        configuration. After the run, the `friends` table must hold more than
        4000 and at most 5000 rows whose source is 2343198944.
        """
        # Put the fixtures in place before start.py is launched.
        shutil.copyfile("seed_with_lots_of_friends.csv", "seeds.csv")
        with open("config.yml", "w") as config_file:
            yaml.dump(mysql_cfg, config_file, default_flow_style=False)

        try:
            output = check_output('python start.py -n 1 -t -p 1',
                                  stderr=STDOUT, shell=True)
        except CalledProcessError as error:
            # Show what the subprocess printed, then let the failure propagate.
            print(str(error.output))
            raise
        else:
            print(str(output))

        handler = DataBaseHandler()
        counts = pd.read_sql(
            "SELECT COUNT(*) FROM friends WHERE source = 2343198944",
            handler.engine)
        friend_count = counts['COUNT(*)'][0]

        self.assertLessEqual(friend_count, 5000)
        self.assertGreater(friend_count, 4000)

        # Clean up the tables this run created.
        handler.engine.execute("DROP TABLE friends, user_details, result;")
Exemplo n.º 3
0
    def test_starting_collectors_and_writing_to_db(self):
        """Run ``start.py -n 2 -t -p 1`` and inspect the `result` table.

        Uses "seeds_test.csv" as seed file and the MySQL test configuration.
        The `result` table must contain at most 8 rows and no duplicated rows.
        """
        # Put the fixtures in place before start.py is launched.
        shutil.copyfile("seeds_test.csv", "seeds.csv")
        with open("config.yml", "w") as config_file:
            yaml.dump(mysql_cfg, config_file, default_flow_style=False)

        try:
            output = check_output('python start.py -n 2 -t -p 1',
                                  stderr=STDOUT, shell=True)
        except CalledProcessError as error:
            # Show what the subprocess printed, then let the failure propagate.
            print(str(error.output))
            raise
        else:
            print(str(output))

        handler = DataBaseHandler()
        result_table = pd.read_sql("result", handler.engine)

        self.assertLessEqual(len(result_table), 8)
        self.assertNotIn(True, result_table.duplicated().values)

        # Clean up the tables this run created.
        handler.engine.execute("DROP TABLE friends, user_details, result;")
Exemplo n.º 4
0
    def test_starts_and_checks_for_necessary_input_config_missing(self):
        """Walk through starting without a config.yml and creating one.

        All assertions live in the handler for a failing ``start.py`` run; if
        ``start.py`` exits successfully, nothing is asserted.
        """
        # The user starts the program with `start.py` (a seed file is needed).
        if not os.path.exists("seeds.csv"):
            shutil.copyfile("seeds.csv.bak", "seeds.csv")

        try:
            # The decoded output is not used, but decoding as ASCII is kept
            # because it can itself raise on unexpected output.
            str(check_output('python start.py', stderr=STDOUT, shell=True),
                encoding="ascii")

        # ... and encounters an error because:
        except CalledProcessError as error:
            failure_output = str(error.output)
            # ... the config.yml is missing, so the user creates a new one
            # using make_config.py. assertIn aborts the test otherwise.
            self.assertIn("provide a config.yml", failure_output)

            # Answering "n" must not create a config.yml.
            declined = Popen("python make_config.py", stdout=PIPE, stderr=PIPE,
                             stdin=PIPE, shell=True)
            declined.communicate("n\n".encode())
            self.assertFalse(os.path.isfile("config.yml"))

            # Answering "y" must create it.
            accepted = Popen("python make_config.py", stdout=PIPE, stderr=PIPE,
                             stdin=PIPE, shell=True)
            accepted.communicate("y\n".encode())
            self.assertTrue(os.path.exists("config.yml"))

            # Replace the generated config with the MySQL test config and
            # drop the tables.
            with open("config.yml", "w") as config_file:
                yaml.dump(mysql_cfg, config_file, default_flow_style=False)

            DataBaseHandler().engine.execute("DROP TABLES friends, user_details, result;")
Exemplo n.º 5
0
    def __init__(self,
                 seeds=2,
                 token_file_name="tokens.csv",
                 seed_list=None,
                 following_pages_limit=0):
        """Load seeds and tokens, fill both work queues and open the database.

        Args:
            seeds (int): how many seeds to sample from the seed pool; only
                used when `seed_list` is None.
            token_file_name (str): passed to `FileImport().read_token_file`.
            seed_list: seeds to use as given instead of sampling.
            following_pages_limit: stored on the instance unchanged.
        """
        # The seed pool is read even when an explicit seed_list is supplied.
        self.seed_pool = FileImport().read_seed_file()

        if seed_list is not None:
            self.number_of_seeds = len(seed_list)
            self.seeds = seed_list
        else:
            self.number_of_seeds = seeds
            try:
                sampled = self.seed_pool.sample(n=self.number_of_seeds)
            except ValueError:  # seed pool too small
                stderr.write(
                    "WARNING: Seed pool smaller than number of seeds.\n")
                # Fall back to sampling with replacement.
                sampled = self.seed_pool.sample(n=self.number_of_seeds,
                                                replace=True)
            # Column 0 of the pool holds the seed values.
            self.seeds = sampled[0].values

        # Queue of seeds waiting to be worked on.
        self.seed_queue = mp.Queue()
        for seed_value in self.seeds:
            self.seed_queue.put(seed_value)

        # Authorized user tokens for the app, read from the token file ...
        self.tokens = FileImport().read_token_file(token_file_name)

        # ... and queued as (token, secret, {}, {}) tuples.
        self.token_queue = mp.Queue()
        for access_token, access_secret in self.tokens.values:
            self.token_queue.put((access_token, access_secret, {}, {}))

        # DataBaseHandler for DB communication.
        self.dbh = DataBaseHandler()
        self.following_pages_limit = following_pages_limit
Exemplo n.º 6
0
    def tearDown(self):
        """Restore config.yml from its backup, remove seeds.csv, drop test tables."""
        if os.path.isfile("config.yml.bak"):
            os.replace("config.yml.bak", "config.yml")
        if os.path.isfile("seeds.csv"):
            os.remove("seeds.csv")

        handler = DataBaseHandler(config_dict=mysql_cfg, create_all=False)

        # One statement per table so that a failure for one table does not
        # prevent the attempt on the others.
        drop_statements = (
            "DROP TABLE friends",
            "DROP TABLE user_details",
            "DROP TABLE result",
        )
        for statement in drop_statements:
            try:
                handler.engine.execute(statement)
            except InternalError:
                # Deliberately ignored — presumably the table did not exist.
                pass
Exemplo n.º 7
0
    def test_restarts_after_exception(self):
        """Check retry, notification and ``--restart`` behaviour of start.py.

        First ``main_loop`` must raise ``TestException`` when asked to fail.
        Then a failing subprocess run must report a retry and a notification
        and leave "latest_seeds.csv" equal to "seeds.csv". Finally a
        ``--restart`` run must announce the restart and change the latest seeds.
        """
        shutil.copyfile("two_seeds.csv", "seeds.csv")
        with open("config.yml", "w") as config_file:
            yaml.dump(mysql_cfg, config_file, default_flow_style=False)

        with self.assertRaises(TestException):
            main_loop(Coordinator(), test_fail=True)

        # Run with -f and inspect what was printed.
        failing_run = Popen("python start.py -n 2 -t -f -p 1", stdout=PIPE,
                            stderr=PIPE, stdin=PIPE, shell=True)
        out_bytes, _ = failing_run.communicate()
        out_text = out_bytes.decode('utf-8')

        self.assertIn("Retrying", out_text)  # tries to restart
        self.assertIn("Sent notification to", out_text)

        latest_seeds = set(
            pd.read_csv("latest_seeds.csv", header=None)[0].values)
        seeds = set(pd.read_csv('seeds.csv', header=None)[0].values)
        self.assertEqual(latest_seeds, seeds)

        # Restart from the latest seeds.
        restart_run = Popen("python start.py -t --restart -p 1", stdout=PIPE,
                            stderr=PIPE, stdin=PIPE, shell=True)
        out_bytes, err_bytes = restart_run.communicate()
        out_text = out_bytes.decode('utf-8')

        self.assertIn("Restarting with latest seeds:", out_text,
                      msg=f"{out_text}\n{err_bytes.decode('utf-8')}")

        latest_seeds = set(
            pd.read_csv("latest_seeds.csv", header=None)[0].values)
        self.assertNotEqual(latest_seeds, seeds)

        DataBaseHandler().engine.execute("DROP TABLE friends, user_details, result;")
class KeyValueStoreClient:
    """Client for a chunked key-value store reached over gRPC.

    Chunk payloads are saved to and read from the ``KeyValueStore`` gRPC
    service, while the mapping of directories and documents to chunk ids is
    kept in a database accessed through ``DataBaseHandler``.

    Every transfer method takes an optional ``stub``. When it is omitted (or
    falsy), an insecure channel to ``KV_STORE_HOST:KV_STORE_PORT`` is opened
    for that single call and closed again when the call finishes.
    """

    def __init__(self, kvstore_host):
        """Create the client for the store running on ``kvstore_host``.

        Args:
            kvstore_host: host name or address used for the gRPC endpoint
                and interpolated into the MySQL connection URL (port 3306).
        """
        log.info("Initializing KeyValueStoreClient")

        self.rm = ResourceManager()
        self.KV_STORE_HOST = kvstore_host
        # NOTE(review): the database user and password are hard-coded in this
        # URL, and the password itself contains '@' — confirm that
        # DataBaseHandler parses it as intended and consider moving the
        # credentials into configuration.
        self.KV_STORE_DB_PATH = 'mysql+pymysql://root:P@ssword123@{}:3306/kvstore'.format(
            kvstore_host)
        self.db = DataBaseHandler(self.KV_STORE_DB_PATH)

        log.info("Initializing KeyValueStoreClient is successful")

    def _run_with_stub(self, stub, operation, *args):
        """Call ``operation(*args, stub)``, creating a stub first if needed.

        When ``stub`` is falsy, an insecure channel is opened as a context
        manager for the duration of the call, so no explicit close or
        callback bookkeeping is required afterwards.

        Returns:
            Whatever ``operation`` returns.
        """
        if stub:
            return operation(*args, stub)

        with grpc.insecure_channel("{}:{}".format(
                self.KV_STORE_HOST, KV_STORE_PORT)) as channel:
            return operation(
                *args, kvstore_pb2_grpc.KeyValueStoreStub(channel))

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        # A bare file name has no directory part; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def upload_bytes(self, dir_id, doc_id, bytez, stub=None):
        """Upload a bytes object as document ``doc_id`` of ``dir_id``."""
        self._run_with_stub(stub, self.__uploadbytes, dir_id, doc_id, bytez)

    def read_bytes(self, dir_id, doc_id, stub=None):
        """Read document ``doc_id`` of ``dir_id`` and return it as bytes."""
        return self._run_with_stub(stub, self.__readbytes, dir_id, doc_id)

    def upload_file(self, dir_id, filepath, stub=None):
        """Read a text file and save it in the store; ``filepath`` is its doc id."""
        self._run_with_stub(stub, self.__uploadfile, dir_id, filepath)

    def upload_file_str(self, dir_id, doc_id, string, stub=None):
        """Upload a string as document ``doc_id`` of ``dir_id``."""
        self._run_with_stub(stub, self.__uploadfilestr, dir_id, doc_id,
                            string)

    def upload_directory(self, dirpath, stub=None):
        """Upload every file below ``dirpath``.

        Returns:
            The newly generated directory id under which the files are stored.
        """
        log.info("Uploading directory {}".format(dirpath))
        dir_id = self._run_with_stub(stub, self.__uploaddirectory, dirpath)
        log.info("Uploading directory {} is successful. dir_id: {}".format(
            dirpath, dir_id))
        return dir_id

    def download_directory(self, dir_id, save_path, flatten=False, stub=None):
        """Save all the files associated with ``dir_id`` below ``save_path``.

        Args:
            flatten: when true, each file is written directly into
                ``save_path`` under its base name instead of its full doc id.
        """
        log.info("Downloading directory {} at {}".format(dir_id, save_path))
        self._run_with_stub(stub, self.__downloaddirectory, dir_id, save_path,
                            flatten)
        log.info("Downloading directory {} at {} is successful".format(
            dir_id, save_path))

    def download_file(self, dir_id, doc_id, root, flatten=False, stub=None):
        """Download a single document from the store into ``root``."""
        self._run_with_stub(stub, self.__downloadfile, dir_id, doc_id, root,
                            flatten)

    def download_chunk(self, chunk_id, save_path, stub=None):
        """Download the chunk ``chunk_id`` and save it as file ``save_path``.

        Nothing is written when the chunk is empty.
        """
        data = self._run_with_stub(stub, self.__readchunk, chunk_id)
        if not data:
            return

        self._ensure_parent_dir(save_path)
        with open(save_path, 'w') as f:
            f.write(data.decode(KV_STORE_ENCODING))

    def read_chunk(self, chunk_id, stub=None):
        """Return the stored value of chunk ``chunk_id``.

        The value is returned exactly as the service delivers it, without
        decoding (``download_chunk`` decodes it, so it is presumably bytes).
        """
        return self._run_with_stub(stub, self.__readchunk, chunk_id)

    def get_doc_metadata(self, dir_id):
        """Return the document rows the database holds for ``dir_id``.

        The first field of each row is used as the doc id by
        ``download_directory``.
        """
        return self.db.get_doc_metadata(dir_id)

    def get_chunk_metadata(self, dir_id, doc_id=None):
        """Return the chunk rows for ``dir_id`` and, optionally, ``doc_id``.

        The field at index 3 of each row is used as the chunk id when reading.
        """
        return self.db.get_chunk_metadata(dir_id, doc_id)

    # ---- Private helper methods ----

    def __uploadblock(self, dir_id, doc_id, chunk_index, chunk_id, bytez,
                      stub):
        """Save one block in the store and record it in the database.

        The save is attempted up to three times. The chunk is only recorded
        in the database once the service reported success, so that the
        metadata never points at a chunk that was not stored.

        Raises:
            RuntimeError: if none of the attempts reported 'success'.
        """
        data_block = kvstore_pb2.DataBlock(key=chunk_id, value=bytez)

        for _ in range(3):
            if stub.Save(data_block).status == 'success':
                break
        else:
            raise RuntimeError(
                "Saving chunk {} (index {}) of document {} failed after 3 "
                "attempts".format(chunk_id, chunk_index, doc_id))

        # save to database
        self.db.save_chunk(dir_id, doc_id, chunk_index, chunk_id)

    def __readbytes(self, dir_id, doc_id, stub):
        """Concatenate all chunks of a document, in metadata order."""
        chunks = self.get_chunk_metadata(dir_id, doc_id)
        # join once instead of repeatedly growing a bytes object
        return b"".join(self.__readchunk(chunk[3], stub) for chunk in chunks)

    def __uploadbytes(self, dir_id, doc_id, bytez, stub):
        """Split ``bytez`` into blocks of KV_STORE_BLOCK_SIZE and upload them."""
        for chunk_index, start in enumerate(
                range(0, len(bytez), KV_STORE_BLOCK_SIZE)):
            # slicing clamps at the end of the data, so the last block may be
            # shorter than the block size
            block = bytez[start:start + KV_STORE_BLOCK_SIZE]
            self.__uploadblock(dir_id, doc_id, chunk_index, generateId(),
                               block, stub)

    def __uploadfile(self, dir_id, filepath, stub):
        """Upload the chunks produced by ``file_iterator(filepath)``."""
        for chunk_index, chunk in file_iterator(filepath):
            self.__uploadblock(dir_id, filepath, chunk_index, generateId(),
                               chunk.encode(KV_STORE_ENCODING), stub)

    def __uploadfilestr(self, dir_id, doc_id, string, stub):
        """Upload the chunks produced by ``str_iterator(string)``."""
        for chunk_index, chunk in str_iterator(string):
            self.__uploadblock(dir_id, doc_id, chunk_index, generateId(),
                               chunk.encode(KV_STORE_ENCODING), stub)

    def __uploaddirectory(self, dirpath, stub):
        """Upload every file below ``dirpath`` under one new directory id."""
        # create an id to keep all files in the directory together on the server
        # all the files can be accessed using this id
        dir_id = generateId()

        # recursively upload all the files in the given directory
        for root, _dirs, files in os.walk(dirpath):
            for filename in files:
                self.__uploadfile(dir_id, os.path.join(root, filename), stub)
        return dir_id

    def __readchunk(self, chunk_id, stub):
        """Fetch one chunk from the service and return its value."""
        return stub.Get(kvstore_pb2.Id(id=chunk_id)).value

    def __downloadfile(self, dir_id, doc_id, root, flatten, stub):
        """Write all chunks of a document to a file below ``root``."""
        save_path = os.path.join(
            root,
            os.path.basename(doc_id) if flatten else doc_id)

        self._ensure_parent_dir(save_path)

        with open(save_path, 'w') as f:
            for chunk in self.get_chunk_metadata(dir_id, doc_id):
                f.write(
                    self.__readchunk(chunk[3],
                                     stub).decode(KV_STORE_ENCODING))

    def __downloaddirectory(self, dir_id, save_path, flatten, stub):
        """Download every document recorded for ``dir_id``."""
        for doc in self.get_doc_metadata(dir_id):
            self.download_file(dir_id, doc[0], save_path, flatten, stub)

    def close(self, channel):
        """Close ``channel``; kept for callers that manage their own channel."""
        channel.close()
Exemplo n.º 9
0
class Coordinator(object):
    """Selects a queue of seeds and coordinates the collection with collectors
    and a queue of tokens.
    """
    def __init__(self,
                 seeds=2,
                 token_file_name="tokens.csv",
                 seed_list=None,
                 following_pages_limit=0):
        """Load seeds and tokens, fill both work queues and open the database.

        Args:
            seeds (int): how many seeds to sample from the seed pool; only
                used when `seed_list` is None.
            token_file_name (str): passed to `FileImport().read_token_file`.
            seed_list: seeds to use as given instead of sampling.
            following_pages_limit: stored on the instance unchanged.
        """
        # The seed pool is read even when an explicit seed_list is supplied.
        self.seed_pool = FileImport().read_seed_file()

        if seed_list is not None:
            self.number_of_seeds = len(seed_list)
            self.seeds = seed_list
        else:
            self.number_of_seeds = seeds
            try:
                sampled = self.seed_pool.sample(n=self.number_of_seeds)
            except ValueError:  # seed pool too small
                stderr.write(
                    "WARNING: Seed pool smaller than number of seeds.\n")
                # Fall back to sampling with replacement.
                sampled = self.seed_pool.sample(n=self.number_of_seeds,
                                                replace=True)
            # Column 0 of the pool holds the seed values.
            self.seeds = sampled[0].values

        # Queue of seeds waiting to be worked on.
        self.seed_queue = mp.Queue()
        for seed_value in self.seeds:
            self.seed_queue.put(seed_value)

        # Authorized user tokens for the app, read from the token file ...
        self.tokens = FileImport().read_token_file(token_file_name)

        # ... and queued as (token, secret, {}, {}) tuples.
        self.token_queue = mp.Queue()
        for access_token, access_secret in self.tokens.values:
            self.token_queue.put((access_token, access_secret, {}, {}))

        # DataBaseHandler for DB communication.
        self.dbh = DataBaseHandler()
        self.following_pages_limit = following_pages_limit

    def bootstrap_seed_pool(self, after_timestamp=0):
        """Extend the seed pool with ids stored in the `user_details` table.

        Every id whose `timestamp` is at or after `after_timestamp` is merged
        into `self.seed_pool` (outer merge on column 0, so ids already in the
        pool are not duplicated). The old and new pool sizes are written to
        stdout.

        Args:
            after_timestamp (int): Unix timestamp lower bound. Default: 0
        Returns:
            None
        """
        stdout.write("Bootstrapping seeds.\n")
        stdout.write(
            f"Old size: {len(self.seed_pool)}. Adding after {after_timestamp} ")
        stdout.flush()

        sql = ("SELECT id FROM user_details "
               f"WHERE UNIX_TIMESTAMP(timestamp) >= {after_timestamp}")
        collected_ids = pd.read_sql(sql, self.dbh.engine)

        # The seed pool keeps its values in a column labelled 0, so the id
        # column is relabelled before merging.
        collected_ids.columns = [0]
        self.seed_pool = self.seed_pool.merge(collected_ids,
                                              how='outer',
                                              on=[0])

        stdout.write(f"New size: {len(self.seed_pool)}\n")
        stdout.flush()

    def lookup_accounts_friend_details(self,
                                       account_id,
                                       db_connection=None,
                                       select="*"):
        """Looks up and retrieves details from friends of `account_id` via database.

        Only friends whose connection is not burned (`burned = 0`) are
        considered.

        Args:
            account_id (int)
            db_connection (database connection/engine object): defaults to
                `self.dbh.engine`
            select (str): comma separated list of required fields, defaults to all available ("*")
        Returns:
            None, if no friends found.
            Otherwise DataFrame with the selected `user_details` columns of
            those friends. Might be empty if none of them has stored details.
        """

        if db_connection is None:
            db_connection = self.dbh.engine

        # NOTE(review): `account_id` and `select` are interpolated into the SQL
        # text; callers must only pass trusted values.
        query = f"SELECT target from friends WHERE source = {account_id} AND burned = 0"
        friends = pd.read_sql(query, db_connection)

        if len(friends) == 0:
            return None

        # Convert to plain Python values before rendering them: the repr of
        # NumPy scalars is not valid SQL on NumPy >= 2 ("np.int64(1)").
        # Joining the values also yields a correct list for a single friend
        # without stripping the trailing comma of a one-element tuple.
        friend_ids = friends['target'].tolist()
        id_list = ", ".join(repr(friend_id) for friend_id in friend_ids)

        query = f"SELECT {select} from user_details WHERE id IN ({id_list})"
        return pd.read_sql(query, db_connection)

    def choose_random_new_seed(self, msg, connection):
        """Pick one random seed from the pool and queue it for collection.

        The token data of `connection` is put back on the token queue before
        the new seed is queued.

        Args:
            msg (str or None): written to stdout (plus newline) unless None
            connection: object providing `token`, `secret`, `reset_time_dict`
                and `calls_dict`
        Returns:
            The randomly chosen seed.
        """
        new_seed = self.seed_pool.sample(n=1)[0].values[0]

        if msg is not None:
            stdout.write(msg + "\n")
            stdout.flush()

        returned_token = (connection.token, connection.secret,
                          connection.reset_time_dict, connection.calls_dict)
        self.token_queue.put(returned_token)
        self.seed_queue.put(new_seed)

        return new_seed

    def write_user_details(self, user_details):
        """Writes pandas.DataFrame `user_details` to MySQL table 'user_details'

        The rows are appended directly. If that violates an integrity
        constraint (duplicate id), the rows are written to a temporary table
        and merged with ``REPLACE INTO``, which overwrites existing rows.

        Args:
            user_details (pandas.DataFrame): rows to store
        Returns:
            None
        """

        try:
            user_details.to_sql('user_details',
                                if_exists='append',
                                index=False,
                                con=self.dbh.engine)

        except IntegrityError:  # duplicate id (primary key)
            temp_tbl_name = self.dbh.make_temp_tbl()
            try:
                user_details.to_sql(temp_tbl_name,
                                    if_exists="append",
                                    index=False,
                                    con=self.dbh.engine)
                query = "REPLACE INTO user_details SELECT * FROM {};".format(
                    temp_tbl_name)
                self.dbh.engine.execute(query)
            finally:
                # Drop the temporary table even when the fallback itself
                # fails, so failed writes do not leave tables behind.
                self.dbh.engine.execute("DROP TABLE " + temp_tbl_name + ";")

    @retry_x_times(10)
    def work_through_seed_get_next_seed(self,
                                        seed,
                                        select=[],
                                        status_lang=None,
                                        connection=None,
                                        fail=False,
                                        **kwargs):
        """Takes a seed and determines the next seed and saves all details collected to db.

        Args:
            seed (int)
            select (list of str): fields to save to database, defaults to all
            status_lang (str): Twitter language code for language of last status to filter for,
                defaults to None
            connection (collector.Connection object)
        Returns:
            seed (int)
        """

        # For testing raise of errors while multithreading
        if fail is True:
            raise TestException

        if 'fail_hidden' in kwargs and kwargs['fail_hidden'] is True:
            raise TestException

        language_check_condition = (status_lang is not None
                                    and 'language_threshold' in kwargs
                                    and kwargs['language_threshold'] > 0)

        keyword_condition = ('keywords' in kwargs
                             and kwargs['keywords'] is not None
                             and len(kwargs['keywords']) > 0)

        if connection is None:
            connection = Connection(token_queue=self.token_queue)

        friends_details = None
        if 'restart' in kwargs and kwargs['restart'] is True:
            print("No db lookup after restart allowed, accessing Twitter API.")
        else:
            try:
                friends_details = self.lookup_accounts_friend_details(
                    seed, self.dbh.engine)

            except ProgrammingError:

                print(
                    """Accessing db for friends_details failed. Maybe database does not exist yet.
                Accessing Twitter API.""")

        if friends_details is None:
            if 'restart' in kwargs and kwargs['restart'] is True:
                pass
            elif language_check_condition or keyword_condition:
                check_exists_query = f"""
                                        SELECT EXISTS(
                                            SELECT source FROM result
                                            WHERE source={seed}
                                            )
                                     """
                seed_depleted = self.dbh.engine.execute(
                    check_exists_query).scalar()

                if seed_depleted == 1:
                    new_seed = self.choose_random_new_seed(
                        f'Seed {seed} is depleted. No friends meet conditions. Random new seed.',
                        connection)

                    return new_seed

            collector = Collector(
                connection,
                seed,
                following_pages_limit=self.following_pages_limit)

            try:
                friend_list = collector.get_friend_list()
                if 'bootstrap' in kwargs and kwargs['bootstrap'] is True:
                    follower_list = collector.get_friend_list(follower=True)
            except tweepy.error.TweepError as e:  # if account is protected
                if "Not authorized." in e.reason:

                    new_seed = self.choose_random_new_seed(
                        "Account {} protected, selecting random seed.".format(
                            seed), connection)

                    return new_seed

                elif "does not exist" in e.reason:

                    new_seed = self.choose_random_new_seed(
                        f"Account {seed} does not exist. Selecting random seed.",
                        connection)

                    return new_seed

                else:
                    raise e

            if friend_list == []:  # if account follows nobody

                new_seed = self.choose_random_new_seed(
                    "No friends or unburned connections left, selecting random seed.",
                    connection)

                return new_seed

            self.dbh.write_friends(seed, friend_list)

            friends_details = collector.get_details(friend_list)
            select = list(
                set(select + [
                    "id", "followers_count", "status_lang", "created_at",
                    "statuses_count"
                ]))
            friends_details = Collector.make_friend_df(friends_details, select)

            if 'bootstrap' in kwargs and kwargs['bootstrap'] is True:
                follower_details = collector.get_details(follower_list)
                follower_details = Collector.make_friend_df(
                    follower_details, select)

            if status_lang is not None:

                if type(status_lang) is str:
                    status_lang = [status_lang]
                friends_details = friends_details[
                    friends_details['status_lang'].isin(status_lang)]

                if 'bootstrap' in kwargs and kwargs['bootstrap'] is True:
                    follower_details = follower_details[
                        follower_details['status_lang'].isin(status_lang)]

                if len(friends_details) == 0:

                    new_seed = self.choose_random_new_seed(
                        f"No friends found with language '{status_lang}', selecting random seed.",
                        connection)

                    return new_seed

            self.write_user_details(friends_details)

            if 'bootstrap' in kwargs and kwargs['bootstrap'] is True:
                self.write_user_details(follower_details)

        if status_lang is not None and len(friends_details) == 0:

            new_seed = self.seed_pool.sample(n=1)
            new_seed = new_seed[0].values[0]

            stdout.write(
                "No user details for friends with last status language '{}' found in db.\n"
                .format(status_lang))
            stdout.flush()

            self.token_queue.put(
                (connection.token, connection.secret,
                 connection.reset_time_dict, connection.calls_dict))

            self.seed_queue.put(new_seed)

            return new_seed

        if 'restart' in kwargs and kwargs['restart'] is True:
            #  lookup just in case we had them already
            friends_details_db = self.lookup_accounts_friend_details(
                seed, self.dbh.engine)
            if friends_details_db is not None and len(friends_details_db) > 0:
                friends_details = friends_details_db

        double_burned = True

        while double_burned is True:
            max_follower_count = friends_details['followers_count'].max()

            new_seed = friends_details[friends_details['followers_count'] ==
                                       max_follower_count]['id'].values[0]

            while language_check_condition or keyword_condition:
                # RETRIEVE AND TEST MORE TWEETS FOR LANGUAGE OR KEYWORDS
                try:
                    latest_tweets = get_latest_tweets(
                        new_seed, connection, fields=['lang', 'full_text'])
                except tweepy.error.TweepError as e:  # if account is protected
                    if "Not authorized." in e.reason:
                        new_seed = self.choose_random_new_seed(
                            f"Account {new_seed} protected, selecting random seed.",
                            connection)

                        return new_seed
                    elif "does not exist" in e.reason:
                        new_seed = self.choose_random_new_seed(
                            f"Account {seed} does not exist. Selecting random seed.",
                            connection)

                        return new_seed
                    else:
                        raise e

                threshold_met = True  # set true per default and change to False if not met
                keyword_met = True

                if language_check_condition:
                    language_fractions = get_fraction_of_tweets_in_language(
                        latest_tweets)

                    threshold_met = any(
                        kwargs['language_threshold'] <= fraction
                        for fraction in language_fractions.values())

                if keyword_condition:
                    keyword_met = any(latest_tweets['full_text'].str.contains(
                        keyword, case=False).any()
                                      for keyword in kwargs['keywords'])

                # THEN REMOVE FROM friends_details DATAFRAME, SEED POOL,
                # AND DATABASE IF FALSE POSITIVE
                # ACCORDING TO THRESHOLD OR KEYWORD

                if threshold_met and keyword_met:
                    break
                else:
                    friends_details = friends_details[
                        friends_details['id'] != new_seed]

                    print(
                        f'seed pool size before removing not matching seed: {len(self.seed_pool)}'
                    )
                    self.seed_pool = self.seed_pool[
                        self.seed_pool[0] != new_seed]
                    print(
                        f'seed pool size after removing not matching seed: {len(self.seed_pool)}'
                    )

                    # query = f"DELETE from user_details WHERE id = {new_seed}"
                    # self.dbh.engine.execute(query)

                    query = f"DELETE from friends WHERE target = {new_seed}"
                    self.dbh.engine.execute(query)

                    # AND REPEAT THE CHECK
                    try:
                        new_seed = friends_details[
                            friends_details['followers_count'] ==
                            max_follower_count]['id'].values[0]
                    except IndexError:  # no more friends
                        new_seed = self.choose_random_new_seed(
                            f'{seed}: No friends meet set conditions. Selecting random.',
                            connection)

                        return new_seed

            check_exists_query = """
                                    SELECT EXISTS(
                                        SELECT * FROM friends
                                        WHERE source={source}
                                        )
                                 """.format(source=new_seed)
            node_exists_as_source = self.dbh.engine.execute(
                check_exists_query).scalar()

            if node_exists_as_source == 1:
                check_follow_query = """
                                        SELECT EXISTS(
                                            SELECT * FROM friends
                                            WHERE source={source} and target={target}
                                            )
                                     """.format(source=new_seed, target=seed)

                follows = self.dbh.engine.execute(check_follow_query).scalar()

            elif node_exists_as_source == 0:
                # check on Twitter

                # FIXTHIS: dirty workaround because of wacky test
                if connection == "fail":
                    connection = Connection()

                try:
                    collector
                except NameError:
                    collector = Collector(connection, seed)

                try:
                    follows = int(
                        collector.check_follows(source=new_seed, target=seed))
                except tweepy.TweepError:
                    print(
                        f"Follow back undetermined. User {new_seed} not available"
                    )
                    follows = 0

            if follows == 0:

                insert_query = f"""
                    INSERT INTO result (source, target)
                    VALUES ({seed}, {new_seed})
                    ON DUPLICATE KEY UPDATE source = source
                """

                self.dbh.engine.execute(insert_query)

                print('\nno follow back: added ({seed})-->({new_seed})'.format(
                    seed=seed, new_seed=new_seed))

            if follows == 1:

                insert_query = f"""
                    INSERT INTO result (source, target)
                    VALUES
                        ({seed}, {new_seed}),
                        ({new_seed}, {seed})
                    ON DUPLICATE KEY UPDATE source = source
                """

                self.dbh.engine.execute(insert_query)

                print('\nfollow back: added ({seed})<-->({new_seed})'.format(
                    seed=seed, new_seed=new_seed))

            update_query = """
                            UPDATE friends
                            SET burned=1
                            WHERE source={source} AND target={target} AND burned = 0
                           """.format(source=seed, target=new_seed)

            update_result = self.dbh.engine.execute(update_query)

            if update_result.rowcount == 0:
                print(
                    f"Connection ({seed})-->({new_seed}) was burned already.")
                friends_details = self.lookup_accounts_friend_details(
                    seed, self.dbh.engine)

                if friends_details is None or len(friends_details) == 0:
                    new_seed = self.choose_random_new_seed(
                        f"No friends or unburned connections left for {seed}, selecting random.",
                        connection)

                    return new_seed

            else:
                print(f"burned ({seed})-->({new_seed})")
                double_burned = False

        self.token_queue.put(
            (connection.token, connection.secret, connection.reset_time_dict,
             connection.calls_dict))

        self.seed_queue.put(new_seed)

        return new_seed

    def start_collectors(self,
                         number_of_seeds=None,
                         select=None,
                         status_lang=None,
                         fail=False,
                         fail_hidden=False,
                         restart=False,
                         retries=10,
                         bootstrap=False,
                         latest_start_time=0,
                         language_threshold=0,
                         keywords=None):
        """Start one collector process per seed taken from `self.seed_queue`.

        Optionally bootstraps the seed pool first, then pulls
        `number_of_seeds` seeds from `self.seed_queue`, creates one
        `MyProcess` per seed targeting
        `self.work_through_seed_get_next_seed`, writes the seeds that were
        taken to `latest_seeds.csv` (no index, no header) and finally starts
        all processes.

        Args:
            number_of_seeds (int): Number of seeds to take from the queue and
                processes to start. Defaults to `self.number_of_seeds`.
            select (list of strings): Fields to save to the user_details
                table in the database. `None` (the default) is treated as an
                empty list. Forwarded to the worker.
            status_lang (str): Language code for the latest tweet language to
                select. Forwarded to the worker.
            fail (bool): Forwarded unchanged to the worker.
            fail_hidden (bool): Forwarded unchanged to the worker.
            restart (bool): Forwarded to the worker. If `bootstrap` is also
                True, `latest_start_time` is reset to 0 before bootstrapping.
            retries (int): Forwarded unchanged to the worker.
            bootstrap (bool): If True, `self.bootstrap_seed_pool` is called
                with `after_timestamp=latest_start_time` before any seed is
                taken from the queue. Also forwarded to the worker.
            latest_start_time: Timestamp handed to `bootstrap_seed_pool` as
                `after_timestamp`; only used when `bootstrap` is True.
            language_threshold: Forwarded unchanged to the worker.
            keywords (list of strings): Forwarded to the worker. `None` (the
                default) is treated as an empty list.

        Returns:
            list of mp.(dummy.)Process: the started processes, in the order
            their seeds were taken from the queue.
        """

        # Use fresh lists per call instead of mutable default arguments, which
        # would be shared between all calls (and all workers they spawn).
        select = [] if select is None else select
        keywords = [] if keywords is None else keywords

        if bootstrap is True:

            # A restart ignores the previous start time and bootstraps from 0.
            if restart is True:
                latest_start_time = 0

            self.bootstrap_seed_pool(after_timestamp=latest_start_time)

        if number_of_seeds is None:
            number_of_seeds = self.number_of_seeds

        processes = []
        seed_list = []

        print("number of seeds: ", number_of_seeds)

        for i in range(number_of_seeds):
            seed = self.seed_queue.get()
            seed_list.append(seed)
            print("seed ", i, ": ", seed)
            processes.append(
                MyProcess(target=self.work_through_seed_get_next_seed,
                          kwargs={
                              'seed': seed,
                              'select': select,
                              'status_lang': status_lang,
                              'fail': fail,
                              'fail_hidden': fail_hidden,
                              'restart': restart,
                              'retries': retries,
                              'language_threshold': language_threshold,
                              'bootstrap': bootstrap,
                              'keywords': keywords
                          },
                          name=str(seed)))

        # Record the seeds of this round on disk before the workers start.
        latest_seeds = pd.DataFrame(seed_list)

        latest_seeds.to_csv('latest_seeds.csv', index=False, header=False)

        for p in processes:
            p.start()
            print(f"Thread {p.name} started.")

        return processes