def index(self, config) -> LocalElementsIndex:
    error = config.get("error", "")
    if error == "index":
        raise SelectorIndexError("test")
    elements = ["skip", "retry3", "retryN", "pass"]
    return LocalElementsIndex(rows=scaffold_elementmap(elements))
def index(self, _) -> LocalElementsIndex:
    results = self._run()
    if len(results) > 0:
        # Build a header row from the first result's keys, then one row of
        # values per result.
        out = []
        out.append(list(results[0].keys()))
        out.extend([list(x.values()) for x in results])
        return LocalElementsIndex(out)
    return None
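# Illustrative sketch (hypothetical field names) of the structure built above:
# a header row taken from the first result's keys, followed by one row of
# values per result.
#
#   results = [{"id": "1", "url": "https://example.com/a"},
#              {"id": "2", "url": "https://example.com/b"}]
#   out == [["id", "url"],
#           ["1", "https://example.com/a"],
#           ["2", "https://example.com/b"]]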
def index(self, config):
    if not os.path.exists(self.disk.read_query(self.name)):
        df = scaffold_elementmap(["el1", "el2", "el3"])
        df = [
            x + [STUB_PATHS.imagejpg] if idx > 0 else (x + ["path"])
            for idx, x in enumerate(df)
        ]
        return LocalElementsIndex(rows=df)
    else:
        return None
def index(self, config):
    c = twint.Config()
    c.Search = config["search_term"]
    c.Since = config["uploaded_after"]
    c.Until = config["uploaded_before"]
    c.Show_hashtags = True
    c.Store_object = True
    twint.run.Search(c)
    tweets = to_serializable(twint.output.tweets_list, as_list=True)
    return LocalElementsIndex(tweets)
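# Minimal sketch of the config this selector reads (hypothetical values);
# twint's Since/Until fields take dates as YYYY-MM-DD strings.
example_config = {
    "search_term": "example query",
    "uploaded_after": "2020-01-01",
    "uploaded_before": "2020-02-01",
}
# index(example_config) would return a LocalElementsIndex of the matching tweets.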
def read_elements_index(self, q: str) -> LocalElementsIndex:
    dest = self.read_query(q)

    def get_rows():
        # Stream rows lazily from the on-disk CSV: the first row becomes the
        # header, and each subsequent row is yielded as a namespace object
        # keyed by those headers.
        with open(dest / self.ELEMENTS_INDEX_FILE, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            for idx, row in enumerate(reader):
                if idx == 0:
                    self.headers = row
                    continue
                allvls = dict(zip(self.headers, row))
                yield Ns(**allvls)

    return LocalElementsIndex(rows=get_rows())
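# Sketch of consuming the lazily generated rows (assumes the stored CSV has an
# "id" column; attribute names mirror whatever the header row contains):
#
#   index = storage.read_elements_index("some-query")
#   for element in index.rows:
#       print(element.id)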
def index(self, config):
    viable_boards = [
        "a", "aco", "adv", "an", "asp", "b", "bant", "biz", "c", "cgl",
        "ck", "cm", "co", "d", "diy", "e", "f", "fa", "fit", "g", "gd",
        "gif", "h", "hc", "his", "hm", "hr", "i", "ic", "int", "jp", "k",
        "lgbt", "lit", "m", "mlp", "mu", "n", "news", "o", "out", "p",
        "po", "pol", "qa", "qst", "r", "r9k", "s", "s4s", "sci", "soc",
        "sp", "t", "tg", "toy", "trash", "trv", "tv", "u", "v", "vg",
        "vip", "vp", "vr", "w", "wg", "wsg", "wsr", "x", "y",
    ]
    results = []
    board = config["board"]
    if board not in viable_boards:
        self.error_logger("Your chosen board does not exist on 4chan!")
        quit()

    # Create an HTML parser for converting comments to plain text
    h = html2text.HTML2Text()
    h.ignore_links = False

    req = f"https://a.4cdn.org/{board}/threads.json"
    content = json.loads(requests.get(req).content)
    max_pages = max(1, min(len(content), int(config["max_pages"])))

    for page_index in range(max_pages):
        page = content[page_index]
        self.logger(f"Scraping page number: {page_index+1}")
        for thread_index, threads in enumerate(page["threads"]):
            self.logger(f"Extracting posts from thread number: {thread_index+1}")
            thread_id = threads["no"]
            req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
            # Thread content is a list of posts
            thread_content = json.loads(requests.get(req).content)["posts"]
            for post_index, post in enumerate(thread_content):
                self.logger(
                    f"Extracting media and comments from post number: {post_index+1}"
                )
                post_row = []
                post_row.append(post["no"])
                post_row.append(thread_id)
                post_row.append(post["time"])

                # Comment text, converted from HTML to plain text
                try:
                    comment = post["com"]
                except KeyError:
                    comment = "..."
                else:
                    comment = h.handle(comment)
                post_row.append(comment)

                # Attached file, if any
                try:
                    filename = post["filename"]
                except KeyError:
                    filename = ""
                if filename != "":
                    time_id = post["tim"]
                    extension = post["ext"]
                    full_file = f"{filename}{extension}"
                    file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                    post_row.append(full_file)
                    post_row.append(extension)
                    post_row.append(file_url)
                else:
                    post_row.append("")
                    post_row.append("")
                    post_row.append("")

                results.append(post_row)

    self.logger("Scraping metadata complete")
    results.insert(
        0, ["id", "thread_id", "datetime", "comment", "filename", "ext", "url"]
    )
    return LocalElementsIndex(results)
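# Minimal sketch of the config this selector reads (hypothetical values):
# "board" must be one of the viable_boards codes, and "max_pages" caps how many
# pages of the board's thread catalog are scraped.
example_config = {"board": "g", "max_pages": 2}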
def index(self, config):
    results = []
    board = config["board"]
    # viable_boards (the list of valid 4chan board codes) is assumed to be
    # defined in the enclosing scope.
    if board not in viable_boards:
        self.error_logger("Your chosen board does not exist on 4chan!")
        quit()

    # Create an HTML parser for converting comments to plain text
    h = html2text.HTML2Text()
    h.ignore_links = False

    req = f"https://a.4cdn.org/{board}/threads.json"
    content = json.loads(requests.get(req).content)

    for page_index, page in enumerate(content):
        self.logger(f"Scraping page number: {page_index+1}")
        for thread_index, threads in enumerate(page["threads"]):
            self.logger(f"Extracting posts from thread number: {thread_index+1}")
            thread_id = threads["no"]
            req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
            # Thread content is a list of posts
            thread_content = json.loads(requests.get(req).content)["posts"]
            for post_index, post in enumerate(thread_content):
                self.logger(
                    f"Extracting media and comments from post number: {post_index+1}"
                )
                post_row = []
                post_row.append(post["no"])
                post_row.append(thread_id)
                post_row.append(post["time"])

                # Comment text, converted from HTML to plain text
                try:
                    comment = post["com"]
                except KeyError:
                    comment = "..."
                else:
                    comment = h.handle(comment)
                post_row.append(comment)

                # Attached file, if any
                try:
                    filename = post["filename"]
                except KeyError:
                    filename = ""
                if filename != "":
                    time_id = post["tim"]
                    extension = post["ext"]
                    full_file = f"{filename}{extension}"
                    file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                    post_row.append(full_file)
                    post_row.append(extension)
                    post_row.append(file_url)
                else:
                    post_row.append("")
                    post_row.append("")
                    post_row.append("")

                results.append(post_row)

    self.logger("Scraping metadata complete")
    results.insert(
        0, ["id", "thread_id", "datetime", "comment", "filename", "ext", "url"]
    )
    return LocalElementsIndex(results)