Example #1
	def get_post_by_id(self, blog_name, post_id):
		"""
		Fetch individual posts
		:param blog_name, str: The blog's name
		:param id, int: The post ID

		returns result list, a list with a dictionary with the post's information
		"""
		if self.interrupted:
			raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr")

		client = self.connect_to_tumblr()

		# Request the specific post.
		post = client.posts(blog_name, id=post_id)

		# Tumblr API can sometimes return with this kind of error:
		# {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}}
		if "posts" not in post:
			return None

		# Get the first element of the list - it's always one post.
		result = post["posts"][0]

		return result
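Every snippet in this listing revolves around the same pattern: inside a potentially long-running loop or blocking call, check the processor's interrupted flag and raise ProcessorInterruptedException so the worker stops gracefully (the docstrings in Examples #12 and #15 describe this behaviour). The sketch below distils that pattern and is illustrative only - ExampleProcessor, fetch_items() and handle() are made-up names, and the exception class is stubbed in rather than imported from 4CAT.

# A minimal, hypothetical sketch of the interruption pattern shared by the
# examples in this listing. Only the interrupted check and the exception are
# taken from the examples; everything else is a stand-in.
class ProcessorInterruptedException(Exception):
	"""Stand-in for 4CAT's exception of the same name."""


class ExampleProcessor:
	interrupted = False  # set to True externally when the worker should stop

	def fetch_items(self):
		yield from range(10)  # placeholder for real data collection

	def handle(self, item):
		pass  # placeholder for real per-item processing

	def process(self):
		for item in self.fetch_items():
			if self.interrupted:
				# caught further up the stack, stopping the worker gracefully
				raise ProcessorInterruptedException("Interrupted while processing items")
			self.handle(item)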
Example #2
	def gather_posts(self, client, queries, max_items, userinfo):
		"""
		Gather messages for each entity for which messages are requested

		:param TelegramClient client:  Telegram Client
		:param list queries:  List of entities to query (as string)
		:param int max_items:  Messages to scrape per entity
		:param bool userinfo:  Whether to scrape detailed user information
		rather than just the ID
		:return list:  List of messages, each message a dictionary.
		"""
		posts = []
		for query in queries:
			query_posts = []
			i = 0
			try:
				for message in client.iter_messages(entity=query):
					if self.interrupted:
						raise ProcessorInterruptedException("Interrupted while fetching message data from the Telegram API")

					if i % 500 == 0:
						self.dataset.update_status("Retrieved %i posts for entity '%s'" % (len(query_posts) + len(posts), query))
					parsed_message = self.import_message(client, message, query, get_full_userinfo=userinfo)
					query_posts.append(parsed_message)

					i += 1
					if i > max_items:
						break
			except ValueError as e:
				self.dataset.update_status("Could not scrape entity '%s'" % query)

			posts += list(reversed(query_posts))

		return posts
Example #3
    def process(self):
        """
		This takes a CSV file as input and writes the same data as a JSON file
		"""
        posts = 0
        self.dataset.update_status("Converting posts")

        # we write to the file row by row, instead of json.dumps()ing all of
        # it at once, since otherwise we risk having to keep a lot of data in
        # memory - this way we buffer one row at most
        with self.dataset.get_results_path().open("w") as output:
            output.write("[")
            for post in self.iterate_items(self.source_file):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing CSV file")

                posts += 1

                if posts > 1:
                    output.write(",")

                output.write(json.dumps(post))
            output.write("]")

        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
Example #4
	def call_penelope_api(self, endpoint, *args, **kwargs):
		"""
		Call PENELOPE API and don't crash (immediately) if it fails

		:param endpoint: Endpoint to call relative to HTTP root
		:param args:
		:param kwargs:
		:return: Response, or `None`
		"""
		retries = 0
		while retries < self.max_retries:
			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while fetching data from the Penelope API")

			try:
				url = "http://penelope.vub.be/guardian-climate-change-data/" + endpoint
				response = requests.get(url, *args, **kwargs)
				break
			except requests.RequestException as e:
				self.log.info("Error %s while querying PENELOPE Guardian API - retrying..." % e)
				retries += 1

		if retries >= self.max_retries:
			self.log.error("Error during PENELOPE fetch of query %s" % self.dataset.key)
			self.dataset.update_status("Error while searching for posts on PENELOPE Guardian API")
			return None
		else:
			return response.json()
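Examples #4 and #20 wrap the interruption check in a retry loop rather than an iteration loop. A generic, hypothetical version of that shape, assuming self.interrupted, self.max_retries and the requests import from the example above; do_request() is an illustrative stand-in for the actual call:

		# Hypothetical retry loop inside a processor method: stay responsive to
		# interruption between attempts and give up after max_retries failures.
		retries = 0
		response = None
		while retries < self.max_retries:
			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while retrying request")
			try:
				response = do_request()  # stand-in for the actual HTTP request
				break
			except requests.RequestException:
				retries += 1

		if response is None:
			return None  # all attempts failed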
Example #5
	def fetch_posts(self, post_ids, where=None, replacements=None, groups=None):
		"""
		Fetch post data from database

		:param list post_ids:  List of post IDs to return data for
		:param list where:  Additional WHERE clauses (without the WHERE keyword) to filter posts by
		:param list replacements:  Replacement values for the additional WHERE clauses
		:param list groups:  Group names to filter posts by, if given
		:return list: List of posts, with a dictionary representing the database record for each post
		"""
		if not where:
			where = []

		if not replacements:
			replacements = []

		columns = ", ".join(self.return_cols)
		where.append("id IN %s")
		replacements.append(post_ids)

		if self.interrupted:
			raise ProcessorInterruptedException("Interrupted while fetching post data")

		if groups:
			where.append("id IN ( SELECT post_id FROM groups_" + self.prefix + " WHERE \"group\" LIKE ANY(%s) )")
			replacements.append(groups)

		query = "SELECT " + columns + " FROM posts_" + self.prefix + " WHERE " + " AND ".join(
			where) + " ORDER BY id ASC"
		return self.db.fetchall_interruptable(self.queue, query, replacements)
Example #6
File: search.py Project: p-charis/4cat
    def items_to_ndjson(self, items, filepath):
        """
		Save retrieved items as an ndjson file

		NDJSON is a file with one valid JSON value per line, in this case each
		of these JSON values represents a retrieved item. This is useful if the
		retrieved data cannot easily be completely stored as a flat CSV file
		and we want to leave the choice of how to flatten it to the user. Note
		that no conversion (e.g. html stripping or pseudonymisation) is done
		here - the items are saved as-is.

		:param Iterator items:  Items to save
		:param Path filepath:  Location to save results file
		"""
        if not filepath:
            raise ResourceWarning("No valid results path supplied")

        processed = 0
        with filepath.open("w", encoding="utf-8", newline="") as outfile:
            for item in items:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results to file")

                outfile.write(json.dumps(item) + "\n")
                processed += 1

        return processed
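A hypothetical call site for items_to_ndjson(), assuming a search worker with the dataset attribute used throughout these examples; collect_items() is an illustrative stand-in for whatever produces the items.

    # Hypothetical usage inside a worker's process() method: stream items
    # straight into an NDJSON results file and finish the dataset.
    def process(self):
        items = self.collect_items()  # any iterator of dictionaries
        written = self.items_to_ndjson(items, self.dataset.get_results_path())
        self.dataset.finish(num_rows=written)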
Example #7
    def process(self):
        """
        This takes a Twitter NDJSON file and converts it so it can be imported by TCAT's import-jsondump.php
        """
        posts = 0
        self.dataset.update_status("Converting posts")

        # This handles and writes one Tweet at a time
        with self.dataset.get_results_path().open("w") as output:
            for post in self.iterate_items(self.source_file, bypass_map_item=True):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while processing NDJSON file")

                posts += 1

                post = self.map_to_TCAT(post)

                # TCAT has a check on line 62 of /import/import-jsondump.php
                # that rejects strings larger than 40960
                # https://github.com/digitalmethodsinitiative/dmi-tcat/blob/9654fe3ff489fd3b0efc6ddcf7c19adf8ed7726d/import/import-jsondump.php#L62
                # We are obviously dropping some tweets because of this
                if len(json.dumps(post)) < 40960:
                    output.write(json.dumps(post, ensure_ascii=False))
                    # NDJSON file is expected by TCAT
                    output.write('\n')

        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
Example #8
	def fetch_sphinx(self, where, replacements):
		"""
		Query Sphinx for matching post IDs

		:param str where:  Drop-in WHERE clause (without the WHERE keyword) for the Sphinx query
		:param list replacements:  Values to use for parameters in the WHERE clause that should be parsed
		:return list:  List of matching posts; each post as a dictionary with `thread_id` and `post_id` as keys
		"""

		# if a Sphinx query is interrupted, pymysql will not actually raise an
		# exception, only a warning. Since we need to detect interruption, we
		# make sure here that pymysql warnings are converted to exceptions
		warnings.filterwarnings("error", module=".*pymysql.*")

		sphinx_start = time.time()
		sphinx = self.get_sphinx_handler()

		results = []
		try:
			sql = "SELECT thread_id, post_id FROM `" + self.prefix + "_posts` WHERE " + where + " LIMIT 5000000 OPTION max_matches = 5000000, ranker = none, boolean_simplify = 1, sort_method = kbuffer, cutoff = 5000000"
			parsed_query = sphinx.mogrify(sql, replacements)
			self.log.info("Running Sphinx query %s " % parsed_query)
			self.running_query = parsed_query
			results = sphinx.fetchall(parsed_query, [])
			sphinx.close()
		except SphinxWarning as e:
			# this is a pymysql warning converted to an exception
			if "query was killed" in str(e):
				self.dataset.update_status("Search was interruped and will restart later")
				raise ProcessorInterruptedException("Interrupted while running Sphinx query")
			else:
				self.dataset.update_status("Error while querying full-text search index", is_final=True)
				self.log.error("Sphinx warning: %s" % e)
		except OperationalError as e:
			self.dataset.update_status(
				"Your query timed out. This is likely because it matches too many posts. Try again with a narrower date range or a more specific search query.",
				is_final=True)
			self.log.info("Sphinx query timed out after %i seconds" % (time.time() - sphinx_start))
			return None
		except ProgrammingError as e:
			if "invalid packet size" in str(e) or "query timed out" in str(e):
				self.dataset.update_status(
					"Error during query. Your query matches too many items. Try again with a narrower date range or a more specific search query.",
					is_final=True)
			elif "syntax error" in str(e):
				self.dataset.update_status(
					"Error during query. Your query syntax may be invalid (check for loose parentheses).",
					is_final=True)
			else:
				self.dataset.update_status(
					"Error during query. Please try a narrow query and double-check your syntax.", is_final=True)
				self.log.error("Sphinx crash during query %s: %s" % (self.dataset.key, e))
			return None


		self.log.info("Sphinx query finished in %i seconds, %i results." % (time.time() - sphinx_start, len(results)))
		return results
Example #9
File: search.py Project: saviaga/4cat
	def process(self):
		"""
		Run 4CAT search query

		Gets query details, passes them on to the object's search method, and
		writes the results to a CSV file. If that all went well, the query and
		job are marked as finished.
		"""

		query_parameters = self.dataset.get_parameters()
		results_file = self.dataset.get_results_path()

		self.log.info("Querying: %s" % str(query_parameters))

		# Execute the relevant query (string-based, random, countryflag-based)
		try:
			posts = self.search(query_parameters)
		except WorkerInterruptedException:
			raise ProcessorInterruptedException("Interrupted while collecting data, trying again later.")

		# Write posts to csv and update the DataBase status to finished
		num_posts = 0
		if posts:
			self.dataset.update_status("Writing posts to result file")
			num_posts = self.posts_to_csv(posts, results_file)
			self.dataset.update_status("Query finished, results are available.")
		elif posts is not None:
			self.dataset.update_status("Query finished, no results found.")

		# queue predefined post-processors
		if num_posts > 0 and query_parameters.get("next", []):
			for next in query_parameters.get("next"):
				next_parameters = next.get("parameters", {})
				next_type = next.get("type", "")
				available_processors = self.dataset.get_available_processors()

				# run it only if the post-processor is actually available for this query
				if next_type in available_processors:
					next_analysis = DataSet(parameters=next_parameters, type=next_type, db=self.db,
											parent=self.dataset.key,
											extension=available_processors[next_type]["extension"])
					self.queue.add_job(next_type, remote_id=next_analysis.key)

		# see if we need to register the result somewhere
		if query_parameters.get("copy_to", None):
			# copy the results to an arbitrary place that was passed
			if self.dataset.get_results_path().exists():
				# but only if we actually have something to copy
				shutil.copyfile(str(self.dataset.get_results_path()), query_parameters.get("copy_to"))
			else:
				# if copy_to was passed, that means it's important that this
				# file exists somewhere, so we create it as an empty file
				with open(query_parameters.get("copy_to"), "w") as empty_file:
					empty_file.write("")

		self.dataset.finish(num_rows=num_posts)
Example #10
    def process(self):
        """
        This takes a 4CAT image archive as input, sends each image to the
        Google Vision API and writes the returned annotations to an NDJSON
        results file, one JSON object per image
        """
        api_key = self.parameters.get("api_key")
        self.dataset.delete_parameter("api_key")  # sensitive, delete after use

        features = self.parameters.get("features")
        features = [{"type": feature} for feature in features]

        if not api_key:
            self.dataset.update_status("You need to provide a valid API key",
                                       is_final=True)
            self.dataset.finish(0)
            return

        max_images = convert_to_int(self.parameters.get("amount", 0), 100)
        total = self.source_dataset.num_rows if not max_images else min(
            max_images, self.source_dataset.num_rows)
        done = 0

        for image_file in self.iterate_archive_contents(self.source_file):
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from Google Vision API")

            done += 1
            self.dataset.update_status("Annotating image %i/%i" %
                                       (done, total))

            try:
                annotations = self.annotate_image(image_file, api_key,
                                                  features)
            except RuntimeError:
                # cannot continue fetching, e.g. when API key is invalid
                break

            if not annotations:
                continue

            annotations = {"file_name": image_file.name, **annotations}

            with self.dataset.get_results_path().open(
                    "a", encoding="utf-8") as outfile:
                outfile.write(json.dumps(annotations) + "\n")

            if max_images and done >= max_images:
                break

        self.dataset.update_status("Annotations retrieved for %i images" %
                                   done)
        self.dataset.finish(done)
Example #11
    def tokens_from_file(self, file, staging_area, phraser=None):
        """
		Read tokens from token dump

		If the tokens were saved as JSON, take advantage of this and return
		them as a generator, reducing memory usage and allowing interruption.

		:param Path file:  Path to the token dump to read
		:param Path staging_area:  Path to staging area, so it can be cleaned
		up when the processor is interrupted
		:param Phraser phraser:  Optional. If given, the yielded sentence is
		passed through the phraser to detect (e.g.) bigrams.
		:return:  Lists of tokens, one list per tokenised item
		"""

        if file.suffix == ".pb":  # Path.suffix includes the leading dot
            with file.open("rb") as input:
                return pickle.load(input)

        with file.open("r") as input:
            input.seek(1)
            while True:
                line = input.readline()
                if not line:
                    # readline() returns an empty string at end of file
                    break

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while reading tokens")

                if line == "]":
                    # this marks the end of the file
                    return

                try:
                    # the tokeniser dumps the json with one set of tokens per
                    # line, ending with a comma
                    line = line.strip()
                    if line[-1] == ",":
                        line = line[:-1]

                    token_set = json.loads(line)
                    if phraser:
                        yield phraser[token_set]
                    else:
                        yield token_set
                except json.JSONDecodeError:
                    # old-format json dumps are not suitable for the generator
                    # approach
                    input.seek(0)
                    everything = json.load(input)
                    return everything
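When the tokens were dumped as JSON, tokens_from_file() yields one list of tokens at a time, so a caller can loop over it directly and stay interruptible between token sets. A hypothetical consumer, in which token_file, staging_area and the Counter are illustrative:

        # Hypothetical usage: count token frequencies without loading the
        # whole token dump into memory at once.
        from collections import Counter

        token_counts = Counter()
        for token_set in self.tokens_from_file(token_file, staging_area):
            token_counts.update(token_set)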
Example #12
    def iterate_archive_contents(self, path, staging_area=None):
        """
		A generator that iterates through files in an archive

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are temporarily unzipped and deleted after use.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, a temporary folder is created and
		deleted after use
		:return:  An iterator with a Path item for each file
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area") and not staging_area:
                self.staging_area = self.dataset.get_staging_area()
                staging_area = self.staging_area

        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    if hasattr(self, "staging_area"):
                        shutil.rmtree(self.staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(file_name, staging_area)

                yield temp_file
                if hasattr(self, "staging_area"):
                    temp_file.unlink()

        if hasattr(self, "staging_area"):
            shutil.rmtree(self.staging_area)
            del self.staging_area
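A hypothetical caller for iterate_archive_contents(), in the same spirit as Example #10 above; handle_file() is an illustrative stand-in for the per-file work.

        # Hypothetical usage: each yielded Path is extracted on demand and
        # cleaned up again after the loop body has run.
        processed = 0
        for extracted_file in self.iterate_archive_contents(self.source_file):
            self.handle_file(extracted_file)  # illustrative per-file processing
            processed += 1
        self.dataset.finish(processed)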
Example #13
	def fetch_threads(self, thread_ids):
		"""
		Fetch post from database for given threads

		:param list thread_ids: List of thread IDs to return post data for
		:return list: List of posts, with a dictionary representing the database record for each post
		"""
		columns = ", ".join(self.return_cols)

		if self.interrupted:
			raise ProcessorInterruptedException("Interrupted while fetching thread data")

		return self.db.fetchall_interruptable(self.queue,
			"SELECT " + columns + " FROM posts_" + self.prefix + " WHERE thread_id IN %s ORDER BY thread_id ASC, id ASC",
											  (thread_ids,))
Example #14
        def resolve_redirect(url, depth=0):
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while expanding URL")

            if hasattr(url, "group"):
                url = url.group(0)

            # get host name to compare to list of shorteners
            host_name = re.sub(r"^[a-z]*://", "", url).split("/")[0].lower()

            if depth >= 10:
                return url

            elif "api.parler.com/l" not in url and host_name not in self.redirect_domains:
                # skip non-redirects
                return url

            elif url in cache:
                return cache[url]

            # to avoid infinite recursion, do not follow more than 10
            # redirects (checked above) and keep track of current depth here:
            depth += 1

            # do this explicitly because it is a known issue and will save
            # one request
            if host_name == "t.co" and "http://" in url:
                url = url.replace("http://", "https://")

            try:
                time.sleep(0.1)
                head_request = requests.head(url, timeout=5)
            except (requests.RequestException, ConnectionError, ValueError, TimeoutError) as e:
                return url

            # if the returned page's status code is in the 'valid request'
            # range, and if it has a Location header different from the page's
            # url, recursively resolve the page it redirects to up to a given
            # depth - infinite recursion is prevented by using a cache
            if 200 <= head_request.status_code < 400:
                redirected_to = head_request.headers.get("Location", url)
                if redirected_to != url:
                    cache[url] = redirected_to
                    return resolve_redirect(redirected_to, depth)

            return url
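A hypothetical call site for resolve_redirect(): because the function accepts both plain strings and regex match objects (hence the hasattr(url, "group") check), it can be passed straight to re.sub() as the replacement callable. The cache dictionary is assumed to exist in the enclosing scope, as the function closes over it; url_pattern and post are illustrative, and re is already imported by the surrounding code since resolve_redirect() itself uses re.sub().

        # Hypothetical usage: expand every (shortened) URL in a post body by
        # letting re.sub() hand each match object to resolve_redirect().
        url_pattern = re.compile(r"https?://[^\s)\]>\"']+")
        expanded_body = url_pattern.sub(resolve_redirect, post["body"])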
Example #15
    def unpack_archive_contents(self, path, staging_area=None):
        """
		Unpack all files in an archive to a staging area

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		Files are unzipped to a staging area. The staging area is *not*
		cleaned up automatically.

		:param Path path: 	Path to zip file to read
		:param Path staging_area:  Where to store the files while they're
		being worked with. If omitted, a temporary folder is created and
		deleted after use
		:return Path:  A path to the staging area
		"""

        if not path.exists():
            return

        if staging_area and (not staging_area.exists()
                             or not staging_area.is_dir()):
            raise RuntimeError("Staging area %s is not a valid folder")
        else:
            if not hasattr(self, "staging_area"):
                self.staging_area = self.dataset.get_staging_area()

            staging_area = self.staging_area

        paths = []
        with zipfile.ZipFile(path, "r") as archive_file:
            archive_contents = sorted(archive_file.namelist())

            for archived_file in archive_contents:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while iterating zip file contents")

                file_name = archived_file.split("/")[-1]
                temp_file = staging_area.joinpath(file_name)
                archive_file.extract(archived_file, staging_area)
                paths.append(temp_file)

        return staging_area
Example #16
    async def gather_posts(self, client, queries, max_items):
        """
		Gather messages for each entity for which messages are requested

		:param TelegramClient client:  Telegram Client
		:param list queries:  List of entities to query (as string)
		:param int max_items:  Messages to scrape per entity
		:return list:  List of messages, each message a dictionary.
		"""
        posts = []
        for query in queries:
            self.dataset.update_status("Fetching messages for entity '%s'" %
                                       query)
            query_posts = []
            i = 0
            try:
                async for message in client.iter_messages(entity=query):
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while fetching message data from the Telegram API"
                        )

                    if i % 500 == 0:
                        self.dataset.update_status(
                            "Retrieved %i posts for entity '%s'" %
                            (len(query_posts) + len(posts), query))

                    if message.action is not None:
                        # e.g. someone joins the channel - not an actual message
                        continue

                    parsed_message = self.import_message(message, query)
                    query_posts.append(parsed_message)

                    i += 1
                    if i > max_items:
                        break
            except (ValueError, UsernameInvalidError) as e:
                self.dataset.update_status("Could not scrape entity '%s'" %
                                           query)

            posts += list(reversed(query_posts))

        return posts
Example #17
	def get_post_by_id(self, blog_name, post_id):
		"""
		Fetch individual posts
		:param blog_name, str: The blog's name
		:param id, int: The post ID
		
		returns result list, a list with a dictionary with the post's information
		"""
		if self.interrupted:
			raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr")

		client = self.connect_to_tumblr()
		
		# Request the specific post.
		post = client.posts(blog_name, id=post_id)
	
		# Get the first element of the list - it's always one post.
		result = post["posts"][0]

		return result
Example #18
    def write_csv_items_and_finish(self, data):
        """
		Write data as csv to results file and finish dataset

		Determines result file path using dataset's path determination helper
		methods. After writing results, the dataset is marked finished. Will
		raise a ProcessorInterruptedException if the interrupted flag for this
		processor is set while iterating.

		:param data: A list or tuple of dictionaries, all with the same keys
		"""
        if not (isinstance(data, typing.List)
                or isinstance(data, typing.Tuple)) or isinstance(data, str):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        if not data:
            raise ValueError(
                "write_csv_items requires a list or tuple with at least one item")

        if not isinstance(data[0], dict):
            raise TypeError(
                "write_csv_items requires a list or tuple of dictionaries as argument"
            )

        self.dataset.update_status("Writing results file")
        with self.dataset.get_results_path().open("w",
                                                  encoding="utf-8",
                                                  newline='') as results:
            writer = csv.DictWriter(results, fieldnames=data[0].keys())
            writer.writeheader()

            for row in data:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results file")
                writer.writerow(row)

        self.dataset.update_status("Finished")
        self.dataset.finish(len(data))
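A hypothetical call site for write_csv_items_and_finish(): gather all rows as dictionaries with identical keys, then write and finish in one call; posts and the selected fields are illustrative.

        # Hypothetical usage: build uniform rows first, since the CSV header
        # is taken from the keys of the first dictionary.
        rows = [{"id": post["id"], "body": post["body"]} for post in posts]
        self.write_csv_items_and_finish(rows)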
Example #19
    def iterate_csv_items(self, path):
        """
		A generator that iterates through a CSV file

		With every iteration, the processor's 'interrupted' flag is checked,
		and if set a ProcessorInterruptedException is raised, which by default
		is caught and subsequently stops execution gracefully.

		:param Path path:  Path to csv file to read
		:return:
		"""
        with open(path, encoding="utf-8") as input:
            reader = csv.DictReader(input)

            for item in reader:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Processor interrupted while iterating through CSV file"
                    )

                yield item
Example #20
    def call_penelope_api(self, params, *args, **kwargs):
        """
        Call PENELOPE API and don't crash (immediately) if it fails

        :param params: Call parameters
        :param args:
        :param kwargs:
        :return: Response, or `None`
        """
        #https://penelope.vub.be/parliament-data/get-speeches/<search_query>/<dataset_name>/<start_date>/<end_date>/<max_number>
        url = "https://penelope.vub.be/parliament-data/get-speeches/%s/%s/%s/%s/"
        url = url % (urllib.parse.quote(
            params["dataset_name"]), urllib.parse.quote(
                params["start_date"]), urllib.parse.quote(params["end_date"]),
                     urllib.parse.quote(params["search_query"]))

        retries = 0
        while retries < self.max_retries:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching data from the Penelope API")

            try:
                response = requests.get(url, *args, **kwargs)
                break
            except requests.RequestException as e:
                self.log.info(
                    "Error %s while querying PENELOPE Parliament Speeches API - retrying..."
                    % e)
                retries += 1

        if retries >= self.max_retries:
            self.log.error("Error during PENELOPE fetch of query %s" %
                           self.dataset.key)
            self.dataset.update_status(
                "Error while searching for posts on PENELOPE Parliament Speeches API"
            )
            return None
        else:
            return response.json()["speeches"]
Example #21
    def process(self):
        """
		This takes a CSV file as input and writes the same data as a JSON file
		"""
        posts = 0
        self.dataset.update_status("Converting posts")

        # painstaking empirical work has determined that this dialect is
        # compatible with the MacOS version of Microsoft Excel
        csv.register_dialect("excel-mac",
                             delimiter=";",
                             doublequote=True,
                             escapechar=None,
                             lineterminator="\r\n",
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL,
                             skipinitialspace=False,
                             strict=False)

        # recreate CSV file with the new dialect
        with self.dataset.get_results_path().open("w") as output:
            fieldnames = self.get_item_keys(self.source_file)

            writer = csv.DictWriter(output,
                                    fieldnames=fieldnames,
                                    dialect="excel-mac")
            writer.writeheader()

            for post in self.iterate_items(self.source_file):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing CSV file")

                writer.writerow(post)
                posts += 1

        # done!
        self.dataset.update_status("Finished.")
        self.dataset.finish(num_rows=posts)
Example #22
    def process(self):
        # parse parameters
        input_words = self.parameters.get("words", "")
        if not input_words or not input_words.split(","):
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        try:
            threshold = float(
                self.parameters.get("threshold",
                                    self.options["threshold"]["default"]))
        except ValueError:
            threshold = float(self.options["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))
        num_words = convert_to_int(self.parameters.get("num-words"),
                                   self.options["num-words"]["default"])
        overlay = self.parameters.get("overlay")
        reduction_method = self.parameters.get("method")
        all_words = self.parameters.get("all-words")

        # load model files and initialise
        self.dataset.update_status("Unpacking word embedding models")
        staging_area = self.unpack_archive_contents(self.source_file)
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status("Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                vector_size = model.vector_size  # needed later for dimensionality reduction

            if common_vocab is None:
                common_vocab = set(model.vocab.keys())
            else:
                common_vocab &= set(model.vocab.keys())  # intersect

        # sort common vocabulary by combined frequency across all models
        # this should make filtering for common words a bit faster further down
        self.dataset.update_status("Sorting vocabulary")
        common_vocab = list(common_vocab)
        common_vocab.sort(key=lambda w: sum(
            [model.vocab[w].count for model in models.values()]),
                          reverse=True)

        # initial boundaries of 2D space (to be adjusted later based on t-sne
        # outcome)
        max_x = 0.0 - sys.float_info.max
        max_y = 0.0 - sys.float_info.max
        min_x = sys.float_info.max
        min_y = sys.float_info.max

        # for each model, find the words that we may want to plot - these are
        # the nearest neighbours for the given query words
        relevant_words = {}

        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one numpy array of vectors,
        # but also keep track of which indexes of this array belong to which
        # model, by storing the index of the first vector for a model
        vectors = numpy.empty((0, vector_size))
        vector_offsets = {}

        # now process each model
        for model_name, model in models.items():
            relevant_words[model_name] = set()  # a set, to avoid duplicate words
            self.dataset.update_status("Finding similar words in model '%s'" %
                                       model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of the
                # nearest neighbours may not be in the common vocabulary and
                # will therefore need to be ignored
                context = set([
                    word[0] for word in model.most_similar(query, topn=1000)
                    if word[0] in common_vocab and word[1] >= threshold
                ][:num_words])

                relevant_words[model_name] |= {
                    query
                } | context  # always include query word

        # now do another loop to determine which words to plot for each model
        # this is either the same as relevant_words, or a superset which
        # combines all relevant words for all models
        plottable_words = {}
        last_model = max(relevant_words.keys())
        all_relevant_words = set().union(*relevant_words.values())

        for model_name, words in relevant_words.items():
            plottable_words[model_name] = []
            vector_offsets[model_name] = len(vectors)

            # determine which words to plot for this model. either the nearest
            # neighbours for this model, or all nearest neighbours found across
            # all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            for word in words_to_include:
                if word in plottable_words[model_name] or (
                        not overlay and model_name != last_model
                        and word not in input_words):
                    # only plot each word once per model, or if 'overlay'
                    # is not set, only once overall (for the most recent
                    # model)
                    continue

                vector = models[model_name][word]
                plottable_words[model_name].append(word)
                vectors = numpy.append(vectors, [vector], axis=0)

        del models  # no longer needed

        # reduce the vectors of all words to be plotted for this model to
        # a two-dimensional coordinate with the previously initialised tsne
        # transformer. here the two-dimensional vectors are interpreted as
        # cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            shutil.rmtree(staging_area)
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # also keep track of the boundaries of our 2D space, so we can plot
        # them properly later
        for position in vectors:
            max_x = max(max_x, position[0])
            max_y = max(max_y, position[1])
            min_x = min(min_x, position[0])
            min_y = min(min_y, position[1])

        # now we know for each model which words should be plotted and at what
        # position
        # with this knowledge, we can normalize the positions, and start
        # plotting them in a graph

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]
        colour_index = 0

        # make sure all coordinates are positive
        max_x -= min_x
        max_y -= min_y

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (max_y / max_x)  # retain proportions
        scale = width / max_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(margin + ((position[0] - min_x) * scale),
                    margin + ((position[1] - min_y) * scale))
                   for position in vectors]

        # now all positions are finalised, we can determine the "journey" of
        # each query - the sequence of positions in the graph it takes, so we
        # can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = []
            for model_name, words in plottable_words.items():
                index = words.index(query)
                journeys[query].append(vectors[vector_offsets[model_name] +
                                               index])

        # font sizes proportional to width (which is static and thus predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # use colour-coded backgrounds to distinguish the query words in the
        # graph, each model (= interval) with a separate colour
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=colours[colour_index])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)
            colour_index += 1

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))
        colour_index = 0

        for model_name, labels in plottable_words.items():
            positions = vectors[
                vector_offsets[model_name]:vector_offsets[model_name] +
                len(labels)]

            label_index = 0
            for position in positions:
                word = labels[label_index]
                is_query = word in input_words
                label_index += 1

                filter = ("url(#solid-%s)" %
                          model_name) if is_query else "none"
                colour = "#FFF" if is_query else colours[colour_index]
                fontsize = fontsize_normal if is_query else fontsize_small

                if word in input_words:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

            colour_index = 0 if colour_index >= len(
                colours) else colour_index + 1

        # plot a line between positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for query, journey in journeys.items():
            previous_position = None
            for position in journey:
                if previous_position is None:
                    previous_position = position
                    continue

                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))
                previous_position = position

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
        shutil.rmtree(staging_area)
        self.dataset.finish(len(journeys))
Example #23
	def get_post_notes(self, di_blogs_ids, only_text_reblogs=True):
		"""
		Gets the post notes.
		:param di_blogs_ids, dict: A dictionary with blog names as keys and post IDs as values.
		:param only_text_reblogs, bool: Whether to only keep notes that are text reblogs.
		"""

		client = self.connect_to_tumblr()

		# List of dict to get reblogs. Items are: [{"blog_name": post_id}]
		text_reblogs = []

		max_date = None

		# Do some counting
		len_blogs = len(di_blogs_ids)
		count = 0

		# Stop trying to fetch the notes after this many retries
		max_notes_retries = 10
		notes_retries = 0

		for key, value in di_blogs_ids.items():

			count += 1

			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr")

			# First, get the blog names and post_ids from reblogs
			# Keep digging till there's nothing left, or if we can fetch no new notes
			while True:

				# Requests a post's notes
				notes = client.notes(key, id=value, before_timestamp=max_date)

				if only_text_reblogs:

					if "notes" in notes:
						notes_retries = 0

						for note in notes["notes"]:
							# If it's a reblog, extract the data and save the rest of the posts for later
							if note["type"] == "reblog":
								if note.get("added_text"):
									text_reblogs.append({note["blog_name"]: note["post_id"]})

						if notes.get("_links"):
							max_date = notes["_links"]["next"]["query_params"]["before_timestamp"]

						# If there's no `_links` key, that's all.
						else:
							break

					# If there's no "notes" key in the returned dict, something might be up
					else:
						self.dataset.update_status("Couldn't get notes for Tumblr request " + str(notes))
						notes_retries += 1

					if notes_retries > max_notes_retries:
						self.failed_notes.append(key)
						break

			self.dataset.update_status("Identified %i text reblogs in %i/%i notes" % (len(text_reblogs), count, len_blogs))

		return text_reblogs
Example #24
	def get_posts_by_blog(self, blog, max_date=None, min_date=None):
		"""
		Get Tumblr posts from a certain blog

		:param str blog: the name of the blog you want to look for
		:param min_date: a unix timestamp; only collect posts after this date.
		:param max_date: a unix timestamp; only collect posts before this date.
		:returns: a tuple with a list of posts and a list of notes
		"""

		blog = blog + ".tumblr.com"
		client = self.connect_to_tumblr()

		if not max_date:
			max_date = int(time.time())

		# Store all posts in here
		all_posts = []

		# Store notes here, if they exist and are requested
		all_notes = []

		# Some retries to make sure the Tumblr API actually returns everything
		retries = 0
		self.max_retries = 48 # 2 days

		# Get Tumblr posts until there's no more left.
		while True:
			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while fetching blog posts from Tumblr")

			# Stop after the maximum number of retries
			if retries >= self.max_retries:
				self.dataset.update_status("No more posts")
				break

			try:
				# Use the pytumblr library to make the API call
				posts = client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw")
				posts = posts["posts"]

				#if (max_date - posts[0]["timestamp"]) > 500000:
					#self.dataset.update_status("ALERT - DATES LIKELY SKIPPED")
					#self.dataset.update_status([post["timestamp"] for post in posts])

			except Exception as e:

				self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date))
				self.api_limit_reached = True
				break

			# Make sure the Tumblr API doesn't magically stop at an earlier date
			if not posts or isinstance(posts, str):
				retries += 1
				max_date -= 3600 # Decrease by an hour
				self.dataset.update_status("No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
				continue

			# Append posts to main list
			else:
				# Keep the notes, if so indicated
				if self.parameters.get("fetch_reblogs"):
					for post in posts:
						if "notes" in post:
							all_notes.append(post["notes"])

				posts = self.parse_tumblr_posts(posts)

				# Get the lowest date
				max_date = sorted([post["timestamp"] for post in posts])[0]

				# Manually check if we have a lower date than the min date (`min_date`) already.
				# This functionality is not natively supported by Tumblr.
				if min_date:
					if max_date < min_date:

						# Get rid of all the posts that are earlier than the min_date timestamp
						posts = [post for post in posts if post["timestamp"] >= min_date]

						if posts:
							all_posts += posts
						break

				retries = 0

				all_posts += posts

				#if (max_date - posts[len(posts) - 1]["timestamp"]) > 500000:
					#self.dataset.update_status("ALERT - DATES LIKELY SKIPPED")
					#self.dataset.update_status([post["timestamp"] for post in posts])

			if len(all_posts) >= self.max_posts:
				self.max_posts_reached = True
				break

			self.dataset.update_status("Collected %s posts" % str(len(all_posts)))

		return all_posts, all_notes
Example #25
	def get_posts_by_tag(self, tag, max_date=None, min_date=None):
		"""
		Get Tumblr posts with a certain tag

		:param str tag: the tag you want to look for
		:param min_date: a unix timestamp; only collect posts after this date.
		:param max_date: a unix timestamp; only collect posts before this date.
		:returns: a list of posts
		"""

		client = self.connect_to_tumblr()

		# Store all posts in here
		all_posts = []

		# Some retries to make sure the Tumblr API actually returns everything.
		retries = 0
		date_retries = 0

		# We're gonna change max_date, so store a copy for reference.
		max_date_original = max_date

		# We use the average time difference between posts to spot possible gaps in the data.
		all_time_difs = []
		avg_time_dif = 0
		time_difs_len = 0

		# Get Tumblr posts until there's no more left.
		while True:
			if self.interrupted:
				raise ProcessorInterruptedException("Interrupted while fetching tag posts from Tumblr")

			# Stop after max for date reductions
			if date_retries >= self.max_date_retries:
				self.dataset.update_status("No more posts in this date range")
				break

			# Stop after max retries for API/connection stuff
			if retries >= self.max_retries:
				self.dataset.update_status("No more posts")
				break

			try:
				# Use the pytumblr library to make the API call
				posts = client.tagged(tag, before=max_date, limit=20, filter="raw")
			except ConnectionError:
				self.update_status("Encountered a connection error, waiting 10 seconds.")
				time.sleep(10)
				retries += 1
				continue

			# Get rid of posts that we already encountered, preventing Tumblr
			# API shenanigans or double posts because of time reductions.
			# Make sure it's not an odd error string, though.
			unseen_posts = []
			for check_post in posts:
				# Sometimes the API responds just with "meta", "response", or "errors".
				if isinstance(check_post, str):
					self.dataset.update_status("Couldnt add post:", check_post)
					retries += 1
					break
				else:
					retries = 0
					if check_post["id"] not in self.seen_ids:
						unseen_posts.append(check_post)
			posts = unseen_posts

			# For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested.
			# So we have to prevent this manually.
			if max_date_original:
				posts = [post for post in posts if post["timestamp"] <= max_date_original]

			max_date_str = datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S")
			
			# except Exception as e:
			# 	print(e)
			# 	self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date))
			# 	self.api_limit_reached = True
			# 	break

			# Make sure the Tumblr API doesn't magically stop at an earlier date
			if not posts:

				date_retries += 1

				# First check carefully for small time gaps by decreasing the
				# date by six hours. If that doesn't result in any new posts,
				# dedicate the remaining retries to reductions of one week, just
				# to be sure there's no data from much earlier missing.

				if date_retries < 96:
					max_date -= 21600 # Decrease by six hours
					self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)" % (str(len(all_posts)), tag, max_date_str, str(date_retries),))
				elif date_retries <= self.max_date_retries:
					max_date -= 604800 # Decrease by one week
					retry_str = str(date_retries - 96)
					self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)" % (str(len(all_posts)), tag, max_date_str, str(retry_str),))

				# We can stop when the max date drops below the min date.
				if max_date <= min_date:
					break

				continue

			# Append posts to main list
			else:

				posts = self.parse_tumblr_posts(posts)
				
				# Get all timestamps and sort them.
				post_dates = sorted([post["timestamp"] for post in posts])
				
				# Get the lowest date and use it as the next "before" parameter.
				max_date = post_dates[0]

				# Tumblr's API is volatile - it doesn't neatly sort posts by date,
				# so it can happen that there's suddenly huge jumps in time.
				# Check if this is happening by extracting the difference between all consecutive dates.
				time_difs = list()
				post_dates.reverse()

				for i, date in enumerate(post_dates):

					if i == (len(post_dates) - 1):
						break

					# Calculate and add time differences
					time_dif = date - post_dates[i + 1]

					# After having collected 250 posts, check whether the time
					# difference between posts far exceeds the average time difference
					# between posts. If it's more than five times this amount,
					# restart the query with the timestamp just before the gap, minus the 
					# average time difference up to this point - something might be up with Tumblr's API.
					if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5):

						time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S")
						self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,))

						self.seen_ids.update([post["id"] for post in posts])
						posts = [post for post in posts if post["timestamp"] >= date]
						if posts:
							all_posts += posts
						
						max_date = date
						break

					time_difs.append(time_dif)
				
				# To start a new query
				if not posts:
					break

				# Manually check if we have a lower date than the lowest allowed date already (min date).
				# This functionality is not natively supported by Tumblr.
				if min_date:
					if max_date < min_date:
					
						# Get rid of all the posts that fall outside the requested min_date/max_date range
						posts = [post for post in posts if post["timestamp"] >= min_date and post["timestamp"] <= max_date_original]
						
						if posts:
							all_posts += posts
							self.seen_ids.update([post["id"] for post in posts])
						break

				# We got a new post, so we can reset the retry counts.
				date_retries = 0
				retries = 0

				# Add retrieved posts to the main list
				all_posts += posts

				# Add to seen ids
				self.seen_ids.update([post["id"] for post in posts])
				
				# Add time differences and calculate new average time difference
				all_time_difs += time_difs

				# Make the average time difference a moving average,
				# to be flexible with faster and slower post paces.
				# Delete the first 100 posts every hundred or so items.
				if (len(all_time_difs) - time_difs_len) > 100:
					all_time_difs = all_time_difs[time_difs_len:]
				if all_time_difs:
					time_difs_len = len(all_time_difs)
					avg_time_dif = sum(all_time_difs) / len(all_time_difs)

			if len(all_posts) >= self.max_posts:
				self.max_posts_reached = True
				break

			self.dataset.update_status("Collected %s posts for tag %s, now looking for posts before %s" % (str(len(all_posts)), tag, max_date_str,))

		return all_posts
Example #26
File: search.py Project: p-charis/4cat
    def items_to_csv(self, results, filepath):
        """
		Takes a list of result rows, converts it to a csv, and writes it to the
		given location. This is mostly a generic dictionary-to-CSV processor but
		some specific processing is done on the "body" key to strip HTML from it,
		and a human-readable timestamp is provided next to the UNIX timestamp.

		:param results:			List of dict rows from data source.
		:param filepath:    	Filepath for the resulting csv

		:return int:  Amount of posts that were processed

		"""
        if not filepath:
            raise ResourceWarning("No result file for query")

        # write the dictionary to a csv
        if not isinstance(filepath, Path):
            filepath = Path(filepath)

        # cache hashed author names, so the hashing function (which is
        # relatively expensive) is not run too often
        pseudonymise_author = bool(self.parameters.get("pseudonymise", None))
        hash_cache = {}

        # prepare hasher (which we may or may not need)
        # we use BLAKE2 for its (so far!) resistance against cryptanalysis and
        # speed, since we will potentially need to calculate a large amount of
        # hashes
        hasher = hashlib.blake2b(digest_size=24)
        hasher.update(str(config.ANONYMISATION_SALT).encode("utf-8"))

        # datetime is used below to convert timestamps; import it once here
        # rather than on every loop iteration
        from datetime import datetime, timezone

        processed = 0
        header_written = False
        with filepath.open("w", encoding="utf-8") as csvfile:
            # Parsing: remove the HTML tags, but keep the <br> as a newline
            # Takes around 1.5 times longer
            for row in results:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while writing results to file")

                if not header_written:
                    fieldnames = list(row.keys())
                    fieldnames.append("unix_timestamp")
                    writer = csv.DictWriter(csvfile,
                                            fieldnames=fieldnames,
                                            lineterminator='\n')
                    writer.writeheader()
                    header_written = True

                processed += 1

                # Create a human-readable date from the UNIX timestamp
                if "timestamp" in row:
                    # Data sources should have "timestamp" as a unix epoch integer,
                    # but do some conversion if this is not the case.
                    timestamp = row["timestamp"]
                    if not isinstance(timestamp, int):
                        if isinstance(
                                timestamp, str
                        ) and "-" not in timestamp:  # String representation of epoch timestamp
                            timestamp = int(timestamp)
                        elif isinstance(
                                timestamp,
                                str) and "-" in timestamp:  # Date string
                            try:
                                timestamp = datetime.strptime(
                                    timestamp, "%Y-%m-%d %H:%M:%S").replace(
                                        tzinfo=timezone.utc).timestamp()
                            except ValueError:
                                timestamp = "undefined"
                        else:
                            timestamp = "undefined"

                    # Add a human-readable date format as well, if we have a valid timestamp.
                    row["unix_timestamp"] = timestamp
                    if timestamp != "undefined":
                        row["timestamp"] = datetime.utcfromtimestamp(
                            timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        row["timestamp"] = timestamp
                else:
                    row["timestamp"] = "undefined"

                # Parse html to text; use .get() so rows without a "body"
                # column do not raise a KeyError
                if row.get("body"):
                    row["body"] = strip_tags(row["body"])

                # replace author column with salted hash of the author name, if
                # pseudonymisation is enabled
                if pseudonymise_author:
                    author_fields = [
                        field for field in row.keys() if "author" in field
                    ]
                    for author_field in author_fields:
                        if row[author_field] not in hash_cache:
                            author_hasher = hasher.copy()
                            author_hasher.update(
                                str(row[author_field]).encode("utf-8"))
                            hash_cache[
                                row[author_field]] = author_hasher.hexdigest()
                            del author_hasher

                        row[author_field] = hash_cache[row[author_field]]

                writer.writerow(row)

        return processed
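The pseudonymisation pattern in items_to_csv - one salted BLAKE2b hasher prepared up front, copied per value, with results cached - can be demonstrated on its own. A minimal sketch; the salt string and the pseudonymise helper are made up for illustration, while 4CAT itself reads the salt from config.ANONYMISATION_SALT.

import hashlib

salt = "example-salt"  # placeholder; not a real 4CAT salt
hasher = hashlib.blake2b(digest_size=24)
hasher.update(salt.encode("utf-8"))

hash_cache = {}

def pseudonymise(value):
    # copying the hasher keeps the salt without re-hashing it every time
    if value not in hash_cache:
        value_hasher = hasher.copy()
        value_hasher.update(str(value).encode("utf-8"))
        hash_cache[value] = value_hasher.hexdigest()
    return hash_cache[value]

print(pseudonymise("some_author") == pseudonymise("some_author"))  # True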
Example #27
0
    def process(self):
        """
		This takes a 4CAT results file as input, and outputs a new CSV file
		with IDs and post bodies for all posts as well as a number of metrics
		derived from the hatebase database, e.g. number of matching items,
		how ambiguous the hatefulness is and the average 'offensiveness'.
		"""
        processed = 0
        parent = self.dataset.get_genealogy()[-2]

        # determine what vocabulary to use
        language = self.parameters.get("language", "")
        if language not in self.options["language"]["options"]:
            language = self.options["language"]["default"]

        # read and convert to a way we can easily match whether any word occurs
        with open(config.PATH_ROOT +
                  "/backend/assets/hatebase/hatebase-%s.json" %
                  language) as hatebasedata:
            hatebase = json.loads(hatebasedata.read())

        hatebase = {term.lower(): hatebase[term] for term in hatebase}
        hatebase_regex = re.compile(
            r"\b(" + "|".join([re.escape(term) for term in hatebase]) + r")\b")

        processed = 0
        with self.dataset.get_results_path().open("w") as output:
            with self.source_file.open() as input:
                reader = csv.DictReader(input)
                fieldnames = reader.fieldnames
                fieldnames += ("hatebase_num", "hatebase_num_ambiguous",
                               "hatebase_num_unambiguous", "hatebase_terms",
                               "hatebase_terms_ambiguous",
                               "hatebase_terms_unambiguous",
                               "hatebase_offensiveness_avg")

                writer = csv.DictWriter(output, fieldnames=fieldnames)
                writer.writeheader()

            for post in self.iterate_csv_items(self.source_file):
                # stop processing if worker has been asked to stop
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while processing posts")

                processed += 1
                if processed % 1000 == 0:
                    self.dataset.update_status("Processing post %i" %
                                               processed)
                row = {
                    **post,
                    **{
                        "hatebase_num": 0,
                        "hatebase_num_ambiguous": 0,
                        "hatebase_num_unambiguous": 0,
                        "hatebase_terms": "",
                        "hatebase_terms_ambiguous": "",
                        "hatebase_terms_unambiguous": "",
                        "hatebase_offensiveness_avg": 0,
                    }
                }

                terms = []
                terms_ambig = []
                terms_unambig = []
                for term in hatebase_regex.findall(post["body"].lower()):
                    if hatebase[term]["plural_of"]:
                        if hatebase[term]["plural_of"] in terms:
                            continue
                        elif hatebase[term]["plural_of"] in hatebase:
                            term = hatebase[term]["plural_of"]

                    terms.append(term)
                    row["hatebase_num"] += 1
                    if hatebase[term]["is_unambiguous"]:
                        row["hatebase_num_unambiguous"] += 1
                        terms_unambig.append(term)
                    else:
                        row["hatebase_num_ambiguous"] += 1
                        terms_ambig.append(term)

                    if hatebase[term]["average_offensiveness"]:
                        row["hatebase_offensiveness_avg"] += hatebase[term][
                            "average_offensiveness"]

                row["hatebase_terms"] = ",".join(terms)
                row["hatebase_terms_ambiguous"] = ",".join(terms_ambig)
                row["hatebase_terms_unambiguous"] = ",".join(terms_unambig)

                if len(terms) > 0:
                    row["hatebase_offensiveness_avg"] = int(
                        int(row["hatebase_offensiveness_avg"]) / len(terms))

                try:
                    writer.writerow(row)
                except ValueError:
                    self.dataset.update_status(
                        "Cannot write results. Your input file may contain invalid CSV data."
                    )
                    self.dataset.finish(0)
                    return

        self.dataset.update_status("Finished")
        self.dataset.finish(processed)
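The matching loop above leans on two hatebase fields, "plural_of" and "is_unambiguous". The following standalone sketch reproduces the plural-folding behaviour with a made-up two-entry vocabulary instead of the real hatebase JSON file.

import re

hatebase = {
    "slur": {"plural_of": "", "is_unambiguous": True, "average_offensiveness": 80},
    "slurs": {"plural_of": "slur", "is_unambiguous": True, "average_offensiveness": 80},
}
hatebase_regex = re.compile(
    r"\b(" + "|".join(re.escape(term) for term in hatebase) + r")\b")

terms = []
for term in hatebase_regex.findall("a slur and more slurs"):
    # fold plural forms back onto their singular entry and only count a
    # lemma once per post
    if hatebase[term]["plural_of"]:
        if hatebase[term]["plural_of"] in terms:
            continue
        elif hatebase[term]["plural_of"] in hatebase:
            term = hatebase[term]["plural_of"]
    terms.append(term)

print(terms)  # -> ['slur']: the plural is folded and not counted twice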
Example #28
0
    def download_thumbnails(self, video_ids):
        """
		Download video thumbnails
		:param video_ids, list: List of YouTube video IDs
		"""

        # prepare staging area
        results_path = self.dataset.get_staging_area()

        # Use YouTubeDL and the YouTube API to request video data
        youtube = build(config.YOUTUBE_API_SERVICE_NAME,
                        config.YOUTUBE_API_VERSION,
                        developerKey=config.YOUTUBE_DEVELOPER_KEY)

        ids_list = get_yt_compatible_ids(video_ids)

        for i, ids_string in enumerate(ids_list):
            # reset the retry counter for each batch, so one failing batch
            # does not block all subsequent requests
            retries = 0

            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while downloading thumbnails from YouTube")

            while retries < self.max_retries:
                try:
                    response = youtube.videos().list(part="snippet",
                                                     id=ids_string,
                                                     maxResults=50).execute()
                    break
                except Exception as error:
                    self.dataset.update_status("Encountered exception " +
                                               str(error) +
                                               ".\nSleeping for " +
                                               str(self.sleep_time))
                    retries += 1
                    api_error = error
                    time.sleep(
                        self.sleep_time)  # Wait a bit before trying again

            # Do nothing with the results if the request failed -
            # those videos will simply not be in the final results file
            if retries >= self.max_retries:
                self.dataset.update_status("Error during YouTube API request")
            else:
                # Get and return results for each video
                for metadata in response["items"]:

                    # Get the URL of the thumbnail
                    thumb_url = metadata["snippet"]["thumbnails"]["high"][
                        "url"]
                    # Format the path to save the thumbnail to
                    save_path = results_path.joinpath(
                        metadata["id"] + "." + str(thumb_url.split('.')[-1]))
                    # Download the image
                    urllib.request.urlretrieve(thumb_url, save_path)

            self.dataset.update_status("Downloaded thumbnails for " +
                                       str(i * 50) + "/" + str(len(video_ids)))

        # create zip of archive and delete temporary files and folder
        self.dataset.update_status("Compressing results into archive")

        # Save the count of images for `finish` function
        image_count = 0

        self.write_archive_and_finish(results_path)
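The videos().list endpoint of the YouTube Data API accepts at most 50 video IDs per request, which is why the IDs are first passed through get_yt_compatible_ids. A rough sketch of what that batching presumably looks like (batch_video_ids is a hypothetical stand-in, not the 4CAT helper itself):

def batch_video_ids(video_ids, batch_size=50):
    # join IDs into comma-separated strings of at most `batch_size` IDs,
    # one string per API request
    batches = []
    for i in range(0, len(video_ids), batch_size):
        batches.append(",".join(video_ids[i:i + batch_size]))
    return batches

# 120 fake IDs become three request strings of 50, 50 and 20 IDs
print([len(b.split(",")) for b in batch_video_ids(["id%i" % i for i in range(120)])])
# -> [50, 50, 20]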
Example #29
0
    def get_items(self, query):
        """
		Run custom search

		Fetches data from Instagram via instaloader.
		"""
        # this is useful to include in the results because researchers are
        # always thirsty for them hashtags
        hashtag = re.compile(r"#([^\s,.+=-]+)")
        mention = re.compile(r"@([a-zA-Z0-9_]+)")

        instagram = instaloader.Instaloader(quiet=True,
                                            download_pictures=False,
                                            download_videos=False,
                                            download_comments=True,
                                            download_geotags=False,
                                            download_video_thumbnails=False,
                                            compress_json=False,
                                            save_metadata=True)

        # ready our parameters
        parameters = self.dataset.get_parameters()
        scope = parameters.get("search_scope", "")
        queries = [
            query.strip() for query in parameters.get("query", "").split(",")
        ]

        posts = []
        max_posts = self.dataset.parameters.get("items", 500)

        # for each query, get items
        for query in queries:
            chunk_size = 0
            self.dataset.update_status("Retrieving posts ('%s')" % query)
            try:
                if scope == "hashtag":
                    query = query.replace("#", "")
                    chunk = instagram.get_hashtag_posts(query)
                elif scope == "username":
                    query = query.replace("@", "")
                    profile = instaloader.Profile.from_username(
                        instagram.context, query)
                    chunk = profile.get_posts()
                else:
                    self.log.warning(
                        "Invalid search scope for instagram scraper: %s" %
                        repr(scope))
                    return []

                # "chunk" is a generator so actually retrieve the posts next
                posts_processed = 0
                for post in chunk:
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while fetching posts from Instagram")

                    chunk_size += 1
                    self.dataset.update_status(
                        "Retrieving posts ('%s', %i posts)" %
                        (query, chunk_size))
                    if posts_processed >= max_posts:
                        break
                    try:
                        posts.append(chunk.__next__())
                        posts_processed += 1
                    except StopIteration:
                        break
            except instaloader.InstaloaderException as e:
                # should we abort here and return 0 posts?
                self.log.warning("Instaloader exception during query %s: %s" %
                                 (self.dataset.key, e))
                self.dataset.update_status(
                    "Error while retrieving posts for query '%s'" % query)

        # go through posts, and retrieve comments
        results = []
        posts_processed = 0
        comments_bit = " and comments" if self.parameters.get(
            "scrape_comments", False) else ""

        for post in posts:
            if self.interrupted:
                raise ProcessorInterruptedException(
                    "Interrupted while fetching post metadata from Instagram")

            posts_processed += 1
            self.dataset.update_status("Retrieving metadata%s for post %i" %
                                       (comments_bit, posts_processed))

            thread_id = post.shortcode

            try:
                results.append({
                    "id": thread_id,
                    "thread_id": thread_id,
                    "parent_id": thread_id,
                    "body": post.caption if post.caption is not None else "",
                    "author": post.owner_username,
                    "timestamp": int(post.date_utc.timestamp()),
                    "type": "video" if post.is_video else "picture",
                    "url": post.video_url if post.is_video else post.url,
                    "thumbnail_url": post.url,
                    "hashtags": ",".join(post.caption_hashtags),
                    "usertags": ",".join(post.tagged_users),
                    "mentioned": ",".join(mention.findall(post.caption) if post.caption else ""),
                    "num_likes": post.likes,
                    "num_comments": post.comments,
                    "subject": ""
                })
            except (instaloader.QueryReturnedNotFoundException,
                    instaloader.ConnectionException):
                pass

            if not self.parameters.get("scrape_comments", False):
                continue

            try:
                for comment in post.get_comments():
                    answers = [answer for answer in comment.answers]

                    try:
                        results.append({
                            "id": comment.id,
                            "thread_id": thread_id,
                            "parent_id": thread_id,
                            "body": comment.text,
                            "author": comment.owner.username,
                            "timestamp": int(comment.created_at_utc.timestamp()),
                            "type": "comment",
                            "url": "",
                            "hashtags": ",".join(hashtag.findall(comment.text)),
                            "usertags": "",
                            "mentioned": ",".join(mention.findall(comment.text)),
                            "num_likes": comment.likes_count if hasattr(comment, "likes_count") else 0,
                            "num_comments": len(answers),
                            "subject": ""
                        })
                    except instaloader.QueryReturnedNotFoundException:
                        pass

                    # instagram only has one reply depth level at the time of
                    # writing, represented here
                    for answer in answers:
                        try:
                            results.append({
                                "id": answer.id,
                                "thread_id": thread_id,
                                "parent_id": comment.id,
                                "body": answer.text,
                                "author": answer.owner.username,
                                "timestamp": int(answer.created_at_utc.timestamp()),
                                "type": "comment",
                                "url": "",
                                "hashtags": ",".join(hashtag.findall(answer.text)),
                                "usertags": "",
                                "mentioned": ",".join(mention.findall(answer.text)),
                                "num_likes": answer.likes_count if hasattr(answer, "likes_count") else 0,
                                "num_comments": 0,
                                "subject": ""
                            })
                        except instaloader.QueryReturnedNotFoundException:
                            pass

            except (instaloader.QueryReturnedNotFoundException,
                    instaloader.ConnectionException):
                # data not available...? this happens sometimes, not clear why
                pass

        # return the compiled posts and comments
        return results
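The hashtag and mention regexes defined at the top of get_items do the work of turning a caption into the comma-separated "hashtags" and "mentioned" fields. A quick standalone example with an invented caption:

import re

hashtag = re.compile(r"#([^\s,.+=-]+)")
mention = re.compile(r"@([a-zA-Z0-9_]+)")

caption = "sunset at the beach #nofilter #sunset with @some_user"
print(",".join(hashtag.findall(caption)))  # nofilter,sunset
print(",".join(mention.findall(caption)))  # some_user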
Example #30
0
    def process(self):
        """
		Reads a CSV file, counts occurrences of chosen values over all posts,
		and aggregates the results per chosen time frame
		"""

        # convenience variables
        timeframe = self.parameters.get("timeframe",
                                        self.options["timeframe"]["default"])
        scope = self.parameters.get("scope", self.options["scope"]["default"])
        min_offensive = self.parameters.get(
            "hatefulness-score", self.options["hatefulness-score"]["default"])

        # determine what vocabulary to use
        language = self.parameters.get("language", "")
        if language not in self.options["language"]["options"]:
            language = self.options["language"]["default"]

        # now for the real deal
        self.dataset.update_status("Reading source file")
        activity = {}
        hateful = {}
        views = {}
        intervals = set()

        with self.source_file.open() as input:
            reader = csv.DictReader(input)
            if "views" in reader.fieldnames:
                engagement_field = "views"
            elif "score" in reader.fieldnames:
                engagement_field = "score"
            elif "likes" in reader.fieldnames:
                engagement_field = "likes"
            else:
                self.dataset.update_status(
                    "No engagement metric available for dataset, cannot chart over-time engagement."
                )
                self.dataset.finish(0)
                return

        with open(config.PATH_ROOT +
                  "/backend/assets/hatebase/hatebase-%s.json" %
                  language) as hatebasedata:
            hatebase = json.loads(hatebasedata.read())

        hatebase = {term.lower(): hatebase[term] for term in hatebase}
        hatebase_regex = re.compile(r"\b(" + "|".join([
            re.escape(term) for term in hatebase if not min_offensive or
            (hatebase[term]["average_offensiveness"]
             and hatebase[term]["average_offensiveness"] > min_offensive)
        ]) + r")\b")

        with open(self.source_file, encoding='utf-8') as source:
            csvfile = csv.DictReader(source)
            for post in csvfile:
                if self.interrupted:
                    raise ProcessorInterruptedException(
                        "Interrupted while reading input")

                time_unit = get_interval_descriptor(post, timeframe)

                # determine where to put this data
                if time_unit not in activity:
                    activity[time_unit] = 0

                if time_unit not in hateful:
                    hateful[time_unit] = 0

                if time_unit not in views:
                    views[time_unit] = 0

                intervals.add(time_unit)

                activity[time_unit] += 1
                try:
                    views[time_unit] += int(post[engagement_field])
                except (ValueError, TypeError):
                    pass

                terms = []
                for term in hatebase_regex.findall(post["body"].lower()):
                    if not term:
                        continue

                    # map plural forms back onto their singular entry
                    if "plural_of" in hatebase[term] and hatebase[term]["plural_of"]:
                        if hatebase[term]["plural_of"] in terms:
                            continue
                        elif hatebase[term]["plural_of"] in hatebase:
                            term = hatebase[term]["plural_of"]

                    # the scope check applies to every matched term, not only
                    # to plural forms
                    if scope == "ambiguous" and not hatebase[term]["is_unambiguous"]:
                        terms.append(term)
                    elif scope == "unambiguous" and hatebase[term]["is_unambiguous"]:
                        terms.append(term)
                    elif scope == "all":
                        terms.append(term)

                hateful[time_unit] += len(terms)

        rows = []
        for interval in sorted(intervals):
            rows.append({
                "date": interval,
                "item": "offensive language",
                "frequency": hateful[interval]
            })
            rows.append({
                "date": interval,
                "item": "messages",
                "frequency": activity[interval]
            })
            rows.append({
                "date": interval,
                "item": engagement_field,
                "frequency": views[interval]
            })

        # write as csv
        if rows:
            self.write_csv_items_and_finish(rows)
        else:
            self.dataset.finish(0)
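The final loop reshapes the three per-interval counters into "long" rows - one row per date and item - presumably so each metric can later be charted as a separate series. A condensed sketch of that reshaping with made-up numbers:

activity = {"2020-01": 120, "2020-02": 90}
hateful = {"2020-01": 4, "2020-02": 7}
views = {"2020-01": 15000, "2020-02": 9000}

rows = []
for interval in sorted(activity):
    rows.append({"date": interval, "item": "offensive language", "frequency": hateful[interval]})
    rows.append({"date": interval, "item": "messages", "frequency": activity[interval]})
    rows.append({"date": interval, "item": "views", "frequency": views[interval]})

for row in rows:
    print(row["date"], row["item"], row["frequency"])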