def add_related_titles_from_file(self, filename, related_ns_list, ns_list):
        """Read a list of titles from a file; for titles in one of the
        specified namespaces, convert each title to one from its related
        namespace (Category talk -> Category, File talk -> File, etc.)
        and add it to the title list and dict. Arguments:
        filename       -- full path to list of titles
        related_ns_list  -- list of namespaces wanted, e.g. ["4", "6", "12"]
        ns_list         -- list of namespaces to convert from, in the same order as the
                          related NsList, e.g. ["5", "7", "13"]"""

        # templates are deliberately excluded here; they are processed
        # separately because the list can be huge and the user should be
        # able to save and reuse it
        fd = File.open_input(filename)
        for raw in fd:
            title = raw.strip()
            prefix, colon, base_title = title.partition(":")
            if not colon:
                # no namespace prefix at all, so this is a main
                # namespace title; it won't be caught below
                if "0" in ns_list:
                    self.list.append(title)
                    self.dict.setdefault(title, {})["0"] = True
                continue
            if prefix not in self.ns_dict_by_string:
                continue
            ns_num = self.ns_dict_by_string[prefix]
            # main, file, category, project talk namespaces
            if ns_num in related_ns_list:
                # convert to file, category, project namespace
                # (subject namespace number = talk namespace number - 1)
                subject_ns = str(int(ns_num) - 1)
                if self.ns_dict[subject_ns]:
                    converted = self.ns_dict[subject_ns] + ":" + base_title
                else:
                    converted = base_title  # main namespace titles
                self.list.append(converted)
                self.dict.setdefault(base_title, {})[subject_ns] = True
            # file, category, project talk namespaces
            elif ns_num in ns_list:
                self.list.append(base_title)
                self.dict.setdefault(base_title, {})[ns_num] = True
        fd.close()
 def get_titles_dict(self, sql_file):
     """Read page records from a file and build a dict keyed by title.
     Arguments:
     sql_file         -- file containing pageid whitespace nsnum whitespace pagetitle where the title
                        is expected to be sql escaped and can be enclosed with single quotes
     Returns a dict mapping title -> {ns (int): pageid (str)}.
     NOTE(review): the title keeps whatever trailing newline was read
     from the file; callers appear to expect that — confirm."""
     fd = File.open_input(sql_file)
     titles = {}
     for line in fd:
         # maxsplit must be 2 so we get exactly three fields; the previous
         # maxsplit of 3 raised ValueError for any title containing a space
         (pageid, ns, title) = line.split(' ', 2)
         ns = int(ns)
         if title in titles:
             titles[title][ns] = pageid
         else:
             titles[title] = {ns: pageid}
     # close the handle; it was previously leaked (siblings close theirs)
     fd.close()
     return titles
 def write_sql(self):
     """Convert the XML log file (self.xml_file) to output written via
     do_log_item, logging to self.log_out_file and, if configured,
     writing user info to self.user_out_file.
     Raises WikiContentErr if the mediawiki/siteinfo header cannot be
     found and skipped in the XML input."""
     # NOTE(review): user id 1 is pre-seeded here — presumably a
     # always-present user; confirm why
     self.user_dict = {1: True}
     fd = File.open_input(self.xml_file)
     logout_fd = File.open_output(self.log_out_file)
     if self.user_out_file:
         userout_fd = File.open_output(self.user_out_file)
     else:
         userout_fd = None
     # try/finally so the handles are closed even when header parsing
     # or item conversion fails (they previously leaked on error)
     try:
         if not self.skip_header(fd):
             raise WikiContentErr(
                 "failed to find end of mediawiki/siteinfo header in xml file\n"
             )
         eof = False
         while not eof:
             eof = self.do_log_item(fd, logout_fd, userout_fd)
     finally:
         fd.close()
         logout_fd.close()
         if userout_fd is not None:
             userout_fd.close()
     return
    def add_titles_from_file(self, filename, ns):
        """add titles from a file to the title list and dict.
        Note that template titles get added to a different title list
        than the rest, for separate processing
        Arguments:
        filename   -- full path to file containing page titles
        ns         -- number (string of digits) of namespace of page titles to
                      grab from file"""

        fd = File.open_input(filename)
        prefix = self.ns_dict[ns] + ":"
        prefix_len = len(prefix)
        for line in fd:
            if not line.startswith(prefix):
                continue
            title = line[:-1]  # lose newline
            if ns == "10":  # special case: templates go on their own list
                self.list_templates.append(title)
            else:
                self.list.append(title)
            no_prefix_title = line[prefix_len:-1]
            if no_prefix_title in self.dict:
                self.dict[no_prefix_title][ns] = True
            else:
                self.dict[no_prefix_title] = {ns: True}
        # close the handle; it was previously leaked (siblings close theirs)
        fd.close()
    def write_stub_and_page_ids(self, content_path, stubs_path, page_ids_path):
        """Write an XML stub file (omitting text content) and a
        list of page ids, from a MediaWiki XML page content file.
        Arguments:
        content_path  -- path to the XML page content file to read
        stubs_path    -- path to the stubs file to write
        page_ids_path  -- path to the page ids file to write"""

        # patterns are raw strings: '\s' in a plain string literal is an
        # invalid escape sequence (SyntaxWarning on modern Python)
        compiled_page_pattern = re.compile(r"^\s*<page>")
        compiled_revision_pattern = re.compile(r"^\s*<revision>")
        compiled_id_pattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
        compiled_text_pattern = re.compile(
            r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

        in_fd = File.open_input(content_path)
        out_fd = File.open_output(stubs_path)
        outpage_id_fd = File.open_output(page_ids_path)
        # id of the revision whose <text> element we expect next
        current_text_id = None

        expect_rev_id = False
        expect_page_id = False

        for line in in_fd:
            # FIXME we could just calculate text len if the output is missing
            # the bytes attr. (as in dumps not from Special:Export)
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            if '<' not in line:
                continue  # these are lines of text, we can skip them
            result = compiled_text_pattern.match(line)
            if result:
                # replace the text element with a self-closing stub that
                # carries the revision id instead of the page text
                line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                    current_text_id, result.group("b"))
                out_fd.write(line)
                continue
            elif '</text' in line:
                continue

            result = compiled_page_pattern.match(line)
            if result:
                expect_page_id = True
                out_fd.write(line)
                continue
            result = compiled_revision_pattern.match(line)
            if result:
                expect_rev_id = True
                out_fd.write(line)
                continue
            if expect_page_id:
                result = compiled_id_pattern.match(line)
                if result:
                    # page ids file gets one "1:<pageid>" line per page
                    outpage_id_fd.write("1:%s\n" % result.group("i"))
                    expect_page_id = False
                out_fd.write(line)
                continue
            if expect_rev_id:
                result = compiled_id_pattern.match(line)
                if result:
                    current_text_id = result.group("i")
                    expect_rev_id = False
                out_fd.write(line)
                continue
            out_fd.write(line)
        in_fd.close()
        out_fd.close()
        outpage_id_fd.close()