示例#1
0
  def GetImportRelpaths(
    self, contentfile_root: pathlib.Path
  ) -> typing.List[str]:
    """Get relative paths to all files in the content files directory.

    Args:
      contentfile_root: The root of the content files directory.

    Returns:
      A list of paths relative to the content files root.

    Raises:
      EmptyCorpusException: If the content files directory is empty.
    """
    with fs.chdir(contentfile_root):
      # Enumerate every regular file beneath the (now current) directory.
      raw_listing = subprocess.check_output(["find", ".", "-type", "f"])
      listing = raw_listing.decode("utf-8").strip()
      if not listing:
        raise errors.EmptyCorpusException(
          f"Empty content files directory: '{contentfile_root}'"
        )
      # One path per line of `find` output.
      return listing.split("\n")
示例#2
0
    def Create(self) -> None:
        """Create the corpus files.

        Raises:
          EmptyCorpusException: If there are no content files, or no
            successfully pre-processed files.
        """
        self._created = True
        logging.info('Content ID: %s', self.content_id)

        # Pre-process under a lock so concurrent processes don't race on the
        # preprocessed database.
        pre_lock = self.preprocessed.database_path.parent / 'LOCK'
        with lockfile.LockFile(pre_lock).acquire(replace_stale=True,
                                                 block=True):
            self.preprocessed.Create(self.config)
        if not self.preprocessed.size:
            raise errors.EmptyCorpusException(
                "Pre-processed corpus contains no files: "
                f"'{self.preprocessed.database_path}'")

        # Encode under its own lock, timing how long atomizer derivation took.
        enc_lock = self.encoded.database_path.parent / 'LOCK'
        with lockfile.LockFile(enc_lock).acquire(replace_stale=True,
                                                 block=True):
            start_time = time.time()
            atomizer = self.atomizer
            elapsed_ms = int((time.time() - start_time) * 1000)
            logging.info('%s: %s tokens in %s ms',
                         type(atomizer).__name__,
                         humanize.intcomma(atomizer.vocab_size),
                         humanize.intcomma(elapsed_ms))
            for token, index in atomizer.vocab.items():
                logging.info('atomizer.vocab  %s : %s', token, index)
            self.encoded.Create(self.preprocessed, atomizer,
                                self.config.contentfile_separator)
示例#3
0
  def Create(self) -> None:
    """Create the corpus files.

    Raises:
      EmptyCorpusException: If there are no content files, or no successfully
        pre-processed files.
    """
    self._created = True
    app.Log(1, 'Content ID: %s', self.content_id)

    # Nothing to do for already-encoded databases.
    # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after splitting
    # out Corpus class.
    if self.config.HasField('pre_encoded_corpus_url'):
      return

    # Both database URLs are sqlite URLs; strip the scheme prefix to recover
    # the on-disk paths, and place each LOCK file next to its database.
    sqlite_prefix_len = len('sqlite:///')
    pre_db_path = pathlib.Path(self.preprocessed.url[sqlite_prefix_len:])
    with lockfile.LockFile(pre_db_path.parent / 'LOCK'):
      self.preprocessed.Create(self.config)
    if not self.preprocessed.size:
      raise errors.EmptyCorpusException(
          f"Pre-processed corpus contains no files: '{self.preprocessed.url}'")

    enc_db_path = pathlib.Path(self.encoded.url[sqlite_prefix_len:])
    with lockfile.LockFile(enc_db_path.parent / 'LOCK'):
      start_time = time.time()
      atomizer = self.atomizer
      app.Log(1, '%s: %s tokens in %s ms',
              type(atomizer).__name__, humanize.Commas(atomizer.vocab_size),
              humanize.Commas(int((time.time() - start_time) * 1000)))
      self.encoded.Create(self.preprocessed, atomizer,
                          self.config.contentfile_separator)
示例#4
0
    def Import(
        self,
        session: sqlutil.Session,
        preprocessed_db: preprocessed.PreprocessedContentFiles,
        atomizer: atomizers.AtomizerBase,
        contentfile_separator: str,
    ) -> None:
        """Encode all not-yet-encoded pre-processed files into `session`.

        Args:
          session: A session on the encoded content files database.
          preprocessed_db: The pre-processed content files database to read.
          atomizer: The atomizer used to encode file contents.
          contentfile_separator: The string that separates content files.

        Raises:
          EmptyCorpusException: If there are no successfully pre-processed
            files awaiting encoding.
        """
        with preprocessed_db.Session() as p_session:
            # Successfully pre-processed files which have not yet been encoded.
            query = p_session.query(
                preprocessed.PreprocessedContentFile).filter(
                    preprocessed.PreprocessedContentFile.
                    preprocessing_succeeded == True,
                    ~preprocessed.PreprocessedContentFile.id.in_(
                        session.query(EncodedContentFile.id).all()),
                )
            # The atomizer is invariant across jobs: pickle it once instead of
            # once per job.
            pickled_atomizer = pickle.dumps(atomizer)
            jobs = [
                internal_pb2.EncoderWorker(
                    id=x.id,
                    text=x.text,
                    contentfile_separator=contentfile_separator,
                    pickled_atomizer=pickled_atomizer,
                ) for x in query
            ]
            if not jobs:
                raise errors.EmptyCorpusException(
                    "Pre-processed corpus contains no files: "
                    f"'{preprocessed_db.url}'")

            app.Log(
                1,
                "Encoding %s of %s preprocessed files",
                humanize.Commas(query.count()),
                humanize.Commas(
                    p_session.query(
                        preprocessed.PreprocessedContentFile).filter(
                            preprocessed.PreprocessedContentFile.
                            preprocessing_succeeded == True).count()),
            )
            bar = progressbar.ProgressBar(max_value=len(jobs))
            last_commit = time.time()
            wall_time_start = time.time()
            # Run the pool as a context manager so worker processes are
            # reliably cleaned up, even if encoding raises.
            with multiprocessing.Pool() as pool:
                for encoded_cf in bar(
                        pool.imap_unordered(EncoderWorker, jobs)):
                    wall_time_end = time.time()
                    # TODO(cec): Remove the if check once EncoderWorker no
                    # longer returns None on atomizer encode error.
                    if encoded_cf:
                        encoded_cf.wall_time_ms = int(
                            (wall_time_end - wall_time_start) * 1000)
                        session.add(encoded_cf)
                    wall_time_start = wall_time_end
                    # Commit periodically so a crash does not lose all
                    # progress.
                    if wall_time_end - last_commit > 10:
                        session.commit()
                        last_commit = wall_time_end
示例#5
0
    def Create(self) -> None:
        """Create the corpus files.

        Raises:
          EmptyCorpusException: If there are no content files, or no
            successfully pre-processed files.
        """
        self._created = True
        app.Log(1, "Content ID: %s", self.content_id)

        # Nothing to do for already-encoded databases.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
        # splitting out Corpus class.
        if self.config.HasField("pre_encoded_corpus_url"):
            # Pre-encoded corpuses have no pre-processed database URL.
            self._RecordDashboardEntry(preprocessed_url="")
            return

        preprocessed_lock_path = (
            pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
            "LOCK")
        with lockfile.LockFile(preprocessed_lock_path):
            self.preprocessed.Create(self.config)
        if not self.preprocessed.size:
            raise errors.EmptyCorpusException(
                f"Pre-processed corpus contains no files: '{self.preprocessed.url}'"
            )
        encoded_lock_path = (
            pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent / "LOCK")
        with lockfile.LockFile(encoded_lock_path):
            start_time = time.time()
            atomizer = self.atomizer
            app.Log(
                1,
                "%s: %s tokens in %s ms",
                type(atomizer).__name__,
                humanize.Commas(atomizer.vocab_size),
                humanize.Commas(int((time.time() - start_time) * 1000)),
            )
            self.encoded.Create(self.preprocessed, atomizer,
                                self.config.contentfile_separator)

        # Add entry to dashboard database
        self._RecordDashboardEntry(preprocessed_url=self.preprocessed.url)

    def _RecordDashboardEntry(self, preprocessed_url: str) -> None:
        """Add (or look up) this corpus's row in the dashboard database.

        Sets `self._dashboard_db_id` to the id of the row.

        Args:
          preprocessed_url: The URL of the pre-processed database, or an empty
            string for pre-encoded corpuses.
        """
        with self.dashboard_db.Session(commit=True) as session:
            config_to_store = corpus_pb2.Corpus()
            config_to_store.CopyFrom(self.config)
            # Clear the contentfiles field, since we use the content_id to
            # uniquely identify the input files. This means that corpuses with
            # the same content files delivered through different means (e.g.
            # two separate but identical directories) have the same hash.
            config_to_store.ClearField("contentfiles")
            corpus = session.GetOrAdd(
                dashboard_db.Corpus,
                config_proto_sha1=crypto.sha1(
                    config_to_store.SerializeToString()),
                config_proto=str(config_to_store),
                preprocessed_url=preprocessed_url,
                encoded_url=self.encoded.url,
                summary=self.GetShortSummary(),
            )
            session.flush()
            self._dashboard_db_id = corpus.id