def add_related_titles_from_file(self, filename, related_ns_list, ns_list): """Read list of titles from file, for those in one of the specified namespaces, convert the title to one from its related namespace (i.e. if it was in Category talk, convert to Category, if it was in File talk, convert to File, etc.) and add to title list and dict. Arguments: filename -- full path to list of titles related_ns_list -- list of namespaces wanted, e.g. ["4", "6", "12"] ns_list -- list of namespaces to convert from, in the same order as the related NsList, e.g. ["5", "7", "13"]""" # don't pass templates in here, we do those separately # because it could be a huge list and we want the user # to be able to save and reuse it fd = File.open_input(filename) for line in fd: line = line.strip() sep = line.find(":") if sep != -1: prefix = line[:sep] if prefix in self.ns_dict_by_string: # main, file, category, project talk namespaces if self.ns_dict_by_string[prefix] in related_ns_list: no_prefix_title = line[sep + 1:] # convert to file, category, project namespace related_ns = str(int(self.ns_dict_by_string[prefix]) - 1) if (self.ns_dict[related_ns]): new_title = self.ns_dict[related_ns] + ":" + no_prefix_title else: new_title = no_prefix_title # main namespace titles self.list.append(new_title) if no_prefix_title in self.dict: self.dict[no_prefix_title][related_ns] = True else: self.dict[no_prefix_title] = {related_ns: True} # file, category, project talk namespaces elif self.ns_dict_by_string[prefix] in ns_list: ns = self.ns_dict_by_string[prefix] no_prefix_title = line[sep + 1:] self.list.append(no_prefix_title) if no_prefix_title in self.dict: self.dict[no_prefix_title][ns] = True else: self.dict[no_prefix_title] = {ns: True} elif "0" in ns_list: # main namespace, won't be caught above self.list.append(line) if line in self.dict: self.dict[line]["0"] = True else: self.dict[line] = {"0": True} fd.close()
def get_titles_dict(self, sql_file):
    """Read page information from a file and build a title lookup table.

    Arguments:
    sql_file -- file containing
                pageid whitespace nsnum whitespace pagetitle
                where the title is expected to be sql escaped and can
                be enclosed with single quotes

    Returns: dict mapping page title -> {namespace number (int): pageid (str)}."""
    fd = File.open_input(sql_file)
    titles = {}
    try:
        for line in fd:
            # maxsplit=2 (bug fix: was 3): two splits yield exactly the three
            # fields and keep any remaining spaces inside the title; maxsplit=3
            # produced a fourth field and a ValueError on such lines
            (pageid, ns, title) = line.split(' ', 2)
            # drop the trailing newline so keys match titles from other sources
            title = title.rstrip('\n')
            ns = int(ns)
            titles.setdefault(title, {})[ns] = pageid
    finally:
        # bug fix: the original never closed the file handle
        fd.close()
    return titles
def write_sql(self):
    """Stream the XML log dump, writing the converted log output and,
    when configured, a user output file as well."""
    # seed with user id 1 — presumably the initial/admin user; TODO confirm
    self.user_dict = {1: True}
    fd = File.open_input(self.xml_file)
    logout_fd = File.open_output(self.log_out_file)
    # the user output file is optional
    userout_fd = File.open_output(self.user_out_file) if self.user_out_file else None
    if not self.skip_header(fd):
        raise WikiContentErr(
            "failed to find end of mediawiki/siteinfo header in xml file\n"
        )
    # process log items one at a time until do_log_item reports end of input
    done = False
    while not done:
        done = self.do_log_item(fd, logout_fd, userout_fd)
    fd.close()
    logout_fd.close()
    if self.user_out_file:
        userout_fd.close()
    return
def add_titles_from_file(self, filename, ns):
    """add titles from a file to the title list and dict. Note that
    template titles get added to a different title list than the rest,
    for separate processing

    Arguments:
    filename -- full path to file containing page titles
    ns       -- number (string of digits) of namespace of page titles
                to grab from file"""
    fd = File.open_input(filename)
    prefix = self.ns_dict[ns] + ":"
    prefix_len = len(prefix)
    try:
        for line in fd:
            if not line.startswith(prefix):
                continue
            if ns == "10":  # special case bleah
                # templates go to their own list for separate processing
                self.list_templates.append(line[:-1])  # lose newline
            else:
                self.list.append(line[:-1])  # lose newline
            no_prefix_title = line[prefix_len:-1]
            # record the namespace in which this title was seen
            self.dict.setdefault(no_prefix_title, {})[ns] = True
    finally:
        # bug fix: the original never closed the file handle
        fd.close()
def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
    """Write an XML stub file (omitting text content) and a list of
    page ids, from a MediaWiki XML page content file.

    Arguments:
    content_path  -- path to the XML page content file to read
    stubs_path    -- path to the stubs file to write
    page_ids_path -- path to the page ids file to write"""
    # fix: patterns are now raw strings so "\s" is a regex class, not an
    # invalid Python string escape (DeprecationWarning, future SyntaxError)
    compiled_page_pattern = re.compile(r"^\s*<page>")
    compiled_revision_pattern = re.compile(r"^\s*<revision>")
    compiled_id_pattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
    compiled_text_pattern = re.compile(r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

    in_fd = File.open_input(content_path)
    out_fd = File.open_output(stubs_path)
    outpage_id_fd = File.open_output(page_ids_path)
    # id of the revision currently being processed; the <id> of a revision
    # precedes its <text> element, so this is set before it is needed
    current_text_id = None
    expect_rev_id = False
    expect_page_id = False
    try:
        for line in in_fd:
            # FIXME we could just calculate text len if the output is missing
            # the bytes attr. (as in dumps not from Special:Export)
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            if '<' in line:
                result = compiled_text_pattern.match(line)
                if result:
                    # collapse the text element to a self-closing stub tag
                    line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                        current_text_id, result.group("b"))
                    out_fd.write(line)
                    continue
                elif '</text' in line:
                    # closing tag of the text body we just collapsed
                    continue
                result = compiled_page_pattern.match(line)
                if result:
                    # the next <id> belongs to the page
                    expect_page_id = True
                    out_fd.write(line)
                    continue
                result = compiled_revision_pattern.match(line)
                if result:
                    # the next <id> belongs to the revision
                    expect_rev_id = True
                    out_fd.write(line)
                    continue
                if expect_page_id:
                    result = compiled_id_pattern.match(line)
                    if result:
                        outpage_id_fd.write("1:%s\n" % result.group("i"))
                        expect_page_id = False
                    out_fd.write(line)
                    continue
                if expect_rev_id:
                    result = compiled_id_pattern.match(line)
                    if result:
                        current_text_id = result.group("i")
                        expect_rev_id = False
                    out_fd.write(line)
                    continue
                out_fd.write(line)
            else:
                continue  # these are lines of text, we can skip them
    finally:
        # fix: close handles even if an error interrupts the copy
        in_fd.close()
        out_fd.close()
        outpage_id_fd.close()