Example #1
    def build(self, outfile_name, infile_names, changed, context):
        """Download .index and .chunk files from prod.

        CompilePOFile takes a long time to compute.  So when not on jenkins we
        call this rule instead to fetch from prod what is there.
        """
        if self._locale_paths is None:
            self._init_locale_paths()

        log.v2("Determining latest prod translation files for %s" %
               context['{lang}'])

        locale = context['{lang}']
        locale_path = 'gs://ka_translations/%s/' % locale
        if locale_path not in self.locale_paths:
            raise NoSuchLocaleCompileFailure(locale)

        try:
            stdout = self.call_with_output(['gsutil', 'ls', locale_path])
        except compile_rule.CompileFailure as e:
            # TODO(james): make sure we download gcloud and gsutil as part
            # of the khan-dotfiles setup.
            raise compile_rule.CompileFailure(
                "%s.\nFailed to download translations from gcs. Make sure "
                "that you have gsutil installed via gcloud." % e)
Example #2
def _read_pofile(filename):
    """Read from filename, a pickled polib.POFile, and return it."""
    log.v2('Reading from %s', filename)
    try:
        with open(filename) as f:
            return cPickle.load(f)
    except (IOError, OSError):
        return None
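
A minimal usage sketch for the helper above: because _read_pofile returns None when the pickle is missing or unreadable, callers fall back to regenerating. The path below is illustrative, not one taken from this build.

# Hypothetical caller; the path is illustrative.
existing_all_pot = _read_pofile('genfiles/translations/all.pot.pickle')
if existing_all_pot is None:
    log.v1('No cached pofile found; regenerating from scratch')
else:
    log.v2('Loaded %s cached entries', len(existing_all_pot))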
Example #3
def _download_from_s3(gitbigfile_module, outfile_abspath, sha):
    s3_fetcher = gitbigfile_module.GitBigfile().transport()
    log.v2('Downloading s3://%s/%s to %s' % (
        s3_fetcher.bucket.name, sha, outfile_abspath + '.tmp'))
    s3_fetcher.get(sha, outfile_abspath + '.tmp')
    # Make sure we don't create the 'real' file until it's fully
    # downloaded.
    try:
        os.unlink(outfile_abspath)
    except (IOError, OSError):
        pass    # probably "file not found"
    try:
        os.rename(outfile_abspath + '.tmp', outfile_abspath)
    except OSError:
        log.v1('Error fetching %s' % outfile_abspath)
        raise
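
The same download-to-'.tmp'-then-rename pattern works for any fetcher; here is a small self-contained sketch of the idea, with a hypothetical fetch callable standing in for the s3 transport.

import os


def _fetch_atomically(fetch, source, dest_path):
    """Fetch source to dest_path without ever exposing a partial file.

    'fetch' is a hypothetical callable(source, local_path); it may raise,
    in which case dest_path is left untouched.
    """
    tmp_path = dest_path + '.tmp'
    fetch(source, tmp_path)
    try:
        os.unlink(dest_path)          # remove any stale copy first
    except (IOError, OSError):
        pass                          # probably "file not found"
    os.rename(tmp_path, dest_path)    # only now does the 'real' file appear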
Example #4
def _write_pofile(po_entries, filename, write_debug_file_to=None):
    """Write a polib.POFile to filename.

    The po-file format is nicely human-readable, but slow to parse.
    The mo-file format is faster to parse, but loses important
    information.  So we introduce a *third* format: pickled
    polib.POFile.  Whenever we save a pofile to disk, we save a
    pickled form of the python data structure (polib.POFile).

    We also normalize the po-entries before writing the file, to
    minimize diffs.

    Arguments:
       po_entries: a list of POEntry objects.
       filename: an absolute path to write the pofile to.
       write_debug_file_to: if not None, a filename to write the po_entries
          as a (human-readable) po-file, rather than a po.pickle file.
    """
    from intl import polib_util

    output_pot = polib_util.pofile()
    output_pot.extend(po_entries)

    # Sort the po-entries in a canonical order, one that makes
    # diff-ing easier while keeping content that is close together in
    # real life close together in the file.  We sort by first
    # occurrence (alphabetically), which is good for most content,
    # but not for datastore entities, which all share the same
    # occurrence (_DATASTORE_FILE:1).  For them, we sort by the first
    # url-they-appear-in.  For entries that match on all of these
    # things, we depend on the fact that python's sorts are stable to
    # keep them in input order (that is, the order in which we
    # extracted them from the input file).
    url_re = re.compile('<http[^>]*>')
    output_pot.sort(key=lambda e: (e.occurrences[0][0],
                                   int(e.occurrences[0][1]),
                                   sorted(url_re.findall(e.comment))[:1]))

    log.v2('Writing to %s', filename)
    with open(filename, 'w') as f:
        cPickle.dump(output_pot, f, protocol=cPickle.HIGHEST_PROTOCOL)

    if write_debug_file_to:
        log.v2('Also writing to %s', write_debug_file_to)
        with open(write_debug_file_to, 'w') as f:
            polib_util.write_pofile(output_pot, f)

    log.v3('Done!')
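
A hedged round-trip sketch of the two helpers, assuming polib and the intl.polib_util module that _write_pofile imports are available; the entry and the /tmp paths are made up.

import polib

# Hypothetical entry; real ones come from string extraction.
entry = polib.POEntry(msgid='Hello, world!',
                      occurrences=[('javascript/hello.js', '1')])
_write_pofile([entry], '/tmp/all.pot.pickle',
              write_debug_file_to='/tmp/all.pot.txt_for_debugging')

reloaded = _read_pofile('/tmp/all.pot.pickle')
assert reloaded is not None
assert reloaded[0].msgid == 'Hello, world!'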
Example #5
class DownloadIndex(compile_rule.CompileBase):
    def __init__(self):
        super(DownloadIndex, self).__init__()
        self._locale_paths = None

    def version(self):
        """Update every time build() changes in a way that affects output."""
        import datetime
        # Force redownloading once a month.
        return datetime.datetime.now().strftime("%Y-%m")

    def build(self, outfile_name, infile_names, changed, context):
        """Download .index and .chunk files from prod.

        CompilePOFile takes a long time to compute.  So when not on jenkins we
        call this rule instead to fetch from prod what is there.
        """
        if self._locale_paths is None:
            self._init_locale_paths()

        log.v2("Determining latest prod translation files for %s" %
               context['{lang}'])

        locale = context['{lang}']
        locale_path = 'gs://ka_translations/%s/' % locale
        if locale_path not in self.locale_paths:
            raise NoSuchLocaleCompileFailure(locale)

        try:
            stdout = self.call_with_output(['gsutil', 'ls', locale_path])
        except compile_rule.CompileFailure as e:
            # TODO(james): make sure we download gcloud and gsutil as part
            # of the khan-dotfiles setup.
            raise compile_rule.CompileFailure(
                "%s.\nFailed to download translations from gcs. Make sure "
                "that you have gsutil installed via gcloud." % e)
        dirs = stdout.split()

        if dirs:
            most_recent_dir = dirs[-1]
            log.v2("Downloading latest prod files from %s" %
                   most_recent_dir)
            self.call(
                ['gsutil', '-m', 'cp', '-r', "%s*" % most_recent_dir,
                 os.path.dirname(outfile_name)])

            return

        # No translation files found on gcs ... let's complain
        raise compile_rule.CompileFailure(
            "Failed to find translation files for %s on gcs" %
            context['{lang}'])
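
Taking dirs[-1] as the newest upload relies on the assumption that gsutil ls returns the per-upload directories in lexicographic order and that their names sort chronologically; a tiny illustration with made-up directory names:

# Made-up `gsutil ls` output: one directory per upload, named by timestamp.
stdout = ('gs://ka_translations/es/2016-01-07-0700/\n'
          'gs://ka_translations/es/2016-02-11-0830/\n')
dirs = stdout.split()
most_recent_dir = dirs[-1]   # lexicographically last == newest (by assumption)
assert most_recent_dir == 'gs://ka_translations/es/2016-02-11-0830/'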
Example #6
    def build(self, outfile_name, infile_names, changed, context):
        # The infiles here are genfiles/extracted_string/foo.pot.pickle
        # Copy unchanged messages from the existing all.pot, if possible.
        po_entries = collections.OrderedDict()
        if outfile_name in changed or changed == infile_names:
            log.v1('Regenerating %s from scratch (it changed on us!)' %
                   outfile_name)
            changed = infile_names  # everything changed
        else:
            # Extract unchanged messages from the existing all.pot
            existing_all_pot = _read_pofile(self.abspath(outfile_name))
            if existing_all_pot:  # we found an existing file
                log.v2('Loading existing messages')

                # We don't care about deleted files: those that
                # existed in the last call to build() but don't exist
                # now.  (They'll be removed from all.pot by default.)
                # Get rid of them from 'changed' so they don't gum up
                # the code below.
                changed = [f for f in changed if f in infile_names]

                # Elements in infile_names and changed look like
                # 'genfiles/extracted_strings/en/foo.pot.pickle'. Here,
                # we want the versions of infiles/changed that are just
                # 'foo'.  We use the _input_map to get that mapping.
                orig_infiles = set(context['_input_map'][f][0]
                                   for f in infile_names)
                # f might not be in _input_map if it's been deleted.
                orig_changed = set(context['_input_map'][f][0]
                                   for f in changed)
                unchanged = orig_infiles - orig_changed
                for entry in existing_all_pot:
                    # Get rid of occurrences for files that no longer exist.
                    # TODO(csilvers): get rid of comments in the same way.
                    entry.occurrences = [
                        occ for occ in entry.occurrences if occ[0] in unchanged
                    ]
                    # If the msgid still exists at all, let's keep it!
                    if entry.occurrences:
                        po_entries[entry.msgid] = entry
            else:
                changed = infile_names

        log.v2('Extracting new and changed messages')
        for filename in changed:
            input_pot = _read_pofile(self.abspath(filename))
            for poentry in input_pot:
                if poentry.msgid in po_entries:
                    existing_poentry = po_entries[poentry.msgid]
                    _merge_poentry(existing_poentry, poentry)
                else:
                    po_entries[poentry.msgid] = poentry

        log.v2('Writing merged output')
        _write_pofile(po_entries.itervalues(),
                      self.abspath(outfile_name),
                      write_debug_file_to=self.abspath(
                          outfile_name.replace('.pickle',
                                               '.txt_for_debugging')))
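
The heart of this rule is a merge keyed on msgid: entries kept from the old all.pot are loaded first, then each re-extracted entry is either merged into the entry already present or appended. A stripped-down sketch of that step, using plain dicts as stand-ins for POEntry objects and a simplified stand-in for _merge_poentry:

import collections


def merge_by_msgid(kept_entries, new_entries):
    # kept_entries: entries carried over from the existing all.pot.
    # new_entries: entries re-extracted from changed input files.
    merged = collections.OrderedDict((e['msgid'], e) for e in kept_entries)
    for entry in new_entries:
        if entry['msgid'] in merged:
            # Simplified stand-in for _merge_poentry(): just pool occurrences.
            merged[entry['msgid']]['occurrences'].extend(entry['occurrences'])
        else:
            merged[entry['msgid']] = entry
    return list(merged.values())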
Example #7
def _update_image_url_info(css_filename, image_url_info):
    """Given css_filenames relative to ka-root, update _IMAGE_URL_INFO.

    Returns:
        A list of image filenames, relative to ka-root, mentioned in
        this css-filename.
    """
    # First, we need to delete all old references to css_filenames.
    for file_info in image_url_info.itervalues():
        new_files = [f for f in file_info[0] if f != css_filename]
        if len(new_files) < len(file_info[0]):
            # We go through this contortion so we can edit the list in place.
            del file_info[0][:]
            file_info[0].extend(new_files)

    # If the file no longer exists (has been deleted), we're done!
    if not os.path.exists(ka_root.join(css_filename)):
        log.v3("removing image-url info for %s: it's been deleted",
               css_filename)
        return

    # Then, we need to add updated references, based on the current
    # file contents.
    log.v2('Parsing image-urls from %s', css_filename)
    with open(ka_root.join(css_filename)) as f:
        content = f.read()

    retval = []
    for (img_url, img_relpath,
         img_size) in _image_urls_and_file_info(content):
        image_url_info.setdefault(img_url, ([], img_relpath, img_size))
        image_url_info[img_url][0].append(css_filename)
        retval.append(img_relpath)

    log.v4('Image-url info: %s', retval)
    return retval
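
For reference, the shape this function assumes for image_url_info is a dict keyed by image url, whose value is a (list of referencing css files, image path relative to ka-root, image size) tuple; the list is mutated in place, which is why the deletion step above edits it rather than rebuilding the tuple. The concrete values here are made up:

# Hypothetical contents, mirroring the structure the function maintains:
image_url_info = {
    '/images/logo.png': (
        ['stylesheets/shared-package/default.css'],   # css files referencing it
        'images/logo.png',                            # path relative to ka-root
        (200, 50),                                    # image size (illustrative)
    ),
}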
Example #8
    def build_many(self, outfile_infiles_changed_context):
        from shared.testutil import fake_datetime

        sha_to_files = {}            # for the files we need to get from S3
        for (outfile, infiles, _, context) in outfile_infiles_changed_context:
            assert len(infiles) == 1, infiles
            assert infiles[0].startswith('intl/translations/')

            with open(self.abspath(infiles[0])) as f:
                head = f.read(64).strip()

            # Does the head look like a sha1?  (sha1's are only 40 bytes.)
            # If so, store it for later.  If not, take care of it now.
            if head.strip('0123456789abcdefABCDEF') == '':
                sha_to_files.setdefault(head, []).append(outfile)
            else:
                # Nope, not a sha1.  NOTE: We could also use a hard-link,
                # but that could fail if genfiles is on a different
                # filesystem from the source.  Copying is more expensive
                # but safer.  Symlinks are right out.
                shutil.copyfile(self.abspath(infiles[0]),
                                self.abspath(outfile))

        if not sha_to_files:
            return

        # We could just call 'git bigfile pull' but we purposefully
        # don't so as to leave untouched the file-contents in
        # intl/translations.  This works better with kake, which
        # doesn't like it when input contents change as part of a kake
        # rule.
        self._munge_sys_path()     # so the following import succeeds
        import gitbigfile.command

        # Download all our files from S3 in parallel.  We store these
        # files under a 'permanent' name based on the sha1.  (Later
        # we'll copy these files to outfile_name.)  That way even if
        # you check out a different branch and come back to this one
        # again, you can get the old contents without needing to
        # revisit S3.
        # GitBigfile() (in _download_from_s3) runs 'git' commands in a
        # subprocess, so we need to be in the right repository for that.
        old_cwd = os.getcwd()
        os.chdir(self.abspath('intl/translations'))
        try:
            # This will actually try to download translation files via
            # bigfile.  This requires a real datetime for making the
            # api requests to S3 (S3 complains about weird dates).
            with fake_datetime.suspend_fake_datetime():
                arglists = []
                for (sha, outfiles) in sha_to_files.iteritems():
                    # Typically a given sha will have only one outfile,
                    # but for some shas (an empty po-file, e.g.), many
                    # outfiles may share the same sha!
                    log.v1('Fetching %s from S3' % ' '.join(outfiles))
                    # We just need to put this in a directory we know we
                    # can write to: take one of the outfile dirs arbitrarily.
                    sha_name = os.path.join(os.path.dirname(outfiles[0]), sha)
                    arglists.append(
                        (gitbigfile.command, self.abspath(sha_name), sha))
                shared.util.thread.run_many_threads(
                    self._download_from_s3, arglists)
        except RuntimeError as why:
            log.error(why)    # probably misleading, but maybe helpful
            # TODO(csilvers): check whether git-bigfile *is* set up
            # correctly, and give a more precise failure message if so.
            raise compile_rule.CompileFailure(
                "Failed to download translation file for %s from S3. "
                "Make sure you have git-bigfile set up as per the "
                "configs in the khan-dotfiles repo: namely, the "
                "'bigfile' section in .gitconfig.khan, and the "
                "update_credentials() section in setup.sh." % outfile)
        finally:
            os.chdir(old_cwd)

        # Now copy from the sha-name to the actual output filename.
        for (sha, outfiles) in sha_to_files.iteritems():
            sha_name = os.path.join(os.path.dirname(outfiles[0]), sha)
            for outfile in outfiles:
                log.v2('Copying from %s to %s' % (sha_name, outfile))
                try:
                    os.unlink(self.abspath(outfile))
                except OSError:
                    pass     # probably file not found
                os.link(self.abspath(sha_name), self.abspath(outfile))
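
The pointer-vs-content test in build_many() relies on a git-bigfile pointer file starting with a bare 40-character hex sha1, while a real translation file does not; a small self-contained illustration of that check:

def _looks_like_sha1(head):
    # True when head is nothing but hex digits of sha1 length, i.e. the
    # file is (almost certainly) a git-bigfile pointer, not real contents.
    head = head.strip()
    return len(head) == 40 and head.strip('0123456789abcdefABCDEF') == ''


assert _looks_like_sha1('2fd4e1c67a2d28fced849ee1bb76e7391b93eb12')
assert not _looks_like_sha1('# An actual po-file, not a git-bigfile pointer.')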