Example No. 1
def ls(project_root):
    """
    Return a list of real, absolute paths to all the git-controlled
    files under the project root.
    """
    git_root = git.find_git_root(project_root)

    # --full-tree = list from the repo root regardless of the current
    #   directory, letting us pass an absolute path as the final
    #   (pathname) argument
    #
    # --name-only = don't show the git id for each object, just the
    #   file name
    #
    # -r = recurse into sub-trees
    #
    # -z = terminate each entry with a null byte instead of a newline
    git_cmd_s = 'ls-tree --full-tree --name-only -r -z HEAD'
    # don't add the project root until after the split, in case it
    # contains spaces.
    git_cmd = git_cmd_s.split()
    git_cmd.append(project_root)
    with git.git_cmd(cmd=git_cmd, cwd=git_root) as out_f:
        fnames_z = out_f.read()
        return [
            util.real_abs_path(fname=fname, parent=git_root)
            for fname in fnames_z.split('\0')
            # skip the '' left by the trailing null terminator.
            if fname
        ]
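
The listing leans on two project-local helpers, `git.git_cmd` and
`util.real_abs_path`, whose definitions are not shown. A minimal sketch of
what they plausibly look like, assuming `git_cmd` is a context manager that
runs `git` via `subprocess` and yields its stdout:

import contextlib
import os
import subprocess


@contextlib.contextmanager
def git_cmd(cmd, cwd):
    # Hypothetical stand-in for git.git_cmd: run `git` with the given
    # arguments in `cwd` and yield its stdout stream.
    proc = subprocess.Popen(['git'] + list(cmd), cwd=cwd,
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    try:
        yield proc.stdout
    finally:
        proc.stdout.close()
        proc.wait()


def real_abs_path(fname, parent=None):
    # Hypothetical stand-in for util.real_abs_path: resolve `fname`
    # (relative to `parent`, if given) to a symlink-free absolute path.
    if parent is not None:
        fname = os.path.join(parent, fname)
    return os.path.realpath(os.path.abspath(fname))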
Example No. 2
def find_git_root(git_repo_or_subdir):
    """
    Returns a real, absolute path to the git root, assuming that
    `git_repo_or_subdir` is a real, absolute path to either a git repo
    or subdir under it.
    """
    cmd = 'rev-parse --show-toplevel'.split()
    with git_cmd(cmd, cwd=git_repo_or_subdir) as out_f:
        git_root = out_f.read().strip()
    return util.real_abs_path(git_root)
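
A quick usage sketch, with a made-up path for illustration:

# Hypothetical path, for illustration only.
root = find_git_root('/home/alice/project/src/subdir')
print(root)  # -> '/home/alice/project'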
Example No. 3
def excavate(project_dir,
             log_cache_dir,
             interesting_fnames_res,
             boring_fnames_res,
             fact_finders,
             summarizer,
             num_procs=1,
             use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.

    If `summarizer` is None, the facts will be printed to standard
    out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null-terminated git logs
      will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring, and should therefore be skipped

    - `fact_finders`: a list whose elements are either strs or lists
      of strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      on the Python path), and each list of strs representing an
      external exe to invoke.  (A sketch of a matching fact finder
      appears after this function.)

      For each log entry of each interesting file in the project dir,
      a function will be passed (fname, log_entry), where fname is the
      name of the file relative to project dir, and log_entry is a
      git_log.LogEntry named tuple.

      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes.  The fields will
      appear in the same order they are declared in git_log.LogEntry.

      It is guaranteed that for a given fname, each log entry will be
      passed to the fact finders in chronological order, in the same
      process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once
      for each generated fact

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and generating them is relatively CPU- and disk-intensive.
      Generally you can raise this number until you max out your disk;
      past that point you won't see further performance improvements.

    Returns `summarizer`.
    """
    pool = multiprocessing.Pool(num_procs)

    project_dir = util.real_abs_path(project_dir)

    fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                     interesting_fnames_res,
                                                     boring_fnames_res)
    log.info("Found %d interesting fnames", len(fnames_to_excavate))
    log.debug("Interesting fnames: %s", fnames_to_excavate)

    rel_and_log_z_fnames = _extract_logs(pool, fnames_to_excavate, project_dir,
                                         log_cache_dir, use_cached_logs)

    facts_async_results = []
    for (rel_name, log_z_fname) in rel_and_log_z_fnames:
        facts_async_results.append(
            pool.apply_async(_find_facts,
                             (rel_name, log_z_fname, fact_finders)))

    for res in facts_async_results:
        facts = res.get(REALLY_LONG_TIME)
        for fact in facts:
            # per the docstring: no summarizer means print to stdout.
            if summarizer is None:
                print(fact)
            else:
                summarizer(fact)

    pool.close()
    pool.join()

    return summarizer
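
To make the fact-finder contract concrete, here is a minimal sketch of a
Python fact finder, a summarizer, and a call wiring them into `excavate`.
The module name, the paths, and the `author` field on `git_log.LogEntry`
are assumptions for illustration, not taken from the listing.

def count_touches(fname, log_entry):
    # Hypothetical fact finder: called once per log entry with the
    # project-relative fname and a git_log.LogEntry; may return any
    # picklable value.  `log_entry.author` is an assumed field.
    return (fname, log_entry.author)


class TouchCounter(object):
    # Hypothetical summarizer: a callable invoked once per fact.
    def __init__(self):
        self.touches = {}

    def __call__(self, fact):
        fname, _author = fact
        self.touches[fname] = self.touches.get(fname, 0) + 1


# 'mymod.count_touches' stands for wherever count_touches is importable
# from on the Python path.
counter = excavate(project_dir='/home/alice/project',
                   log_cache_dir='/tmp/log-cache',
                   interesting_fnames_res=[r'\.py$'],
                   boring_fnames_res=[r'(^|/)tests?/'],
                   fact_finders=['mymod.count_touches'],
                   summarizer=TouchCounter(),
                   num_procs=4)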