Example #1
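The TrainSentiment class below is the lexicon-training pipeline from the
NLP Architect ABSA (aspect-based sentiment analysis) solution: it parses raw
documents with a SpaCy-BIST parser, acquires candidate aspect and opinion
lexicons, and reranks the opinion terms with a pre-trained model over GloVe
embeddings.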
from os import PathLike
from pathlib import Path, PosixPath
from typing import Union

# AcquireTerms, RerankTerms, download_unzip, parse_docs, the _write_*/_download_*
# helpers and the EMBEDDING_*/RERANK_MODEL_DEFAULT_PATH/TRAIN_OUT/LEXICONS_OUT
# constants come from the surrounding nlp_architect ABSA package (exact module
# paths vary by NLP Architect version)


class TrainSentiment:
    def __init__(self,
                 parse: bool = True,
                 rerank_model: PathLike = None,
                 asp_thresh: int = 3,
                 op_thresh: int = 2,
                 max_iter: int = 3):
        self.acquire_lexicon = AcquireTerms(asp_thresh, op_thresh, max_iter)
        # Import the parser lazily so that callers who supply pre-parsed
        # data (parse=False) skip loading the SpaCy-BIST models
        if parse:
            from nlp_architect.pipelines.spacy_bist import SpacyBISTParser
            self.parser = SpacyBISTParser()
        else:
            self.parser = None

        # Fall back to the bundled pre-trained reranking model if none is given
        if not rerank_model:
            print("using pre-trained reranking model")
            rerank_model = _download_pretrained_rerank_model(
                RERANK_MODEL_DEFAULT_PATH)

        # Fetch the GloVe word embeddings used by the term reranker
        download_unzip(*EMBEDDING_URL,
                       EMBEDDING_PATH,
                       license_msg="GloVe word embeddings.")
        self.rerank = RerankTerms(vector_cache=True,
                                  rerank_model=rerank_model,
                                  emb_model_path=EMBEDDING_PATH)

    def run(self,
            data: Union[str, PathLike] = None,
            parsed_data: Union[str, PathLike] = None,
            out_dir: Union[str, PathLike] = TRAIN_OUT):
        """Acquire and rerank aspect/opinion lexicons from raw or parsed data."""
        if not parsed_data:
            if not self.parser:
                raise RuntimeError(
                    "Parser not initialized (try parse=True at init)")
            parsed_dir = Path(out_dir) / "parsed" / Path(data).stem
            parsed_data = self.parse_data(data, parsed_dir)

        # Extract candidate aspect and opinion terms from the parsed documents
        generated_aspect_lex = self.acquire_lexicon.acquire_lexicons(
            parsed_data)
        _write_aspect_lex(parsed_data, generated_aspect_lex, LEXICONS_OUT)

        # Rerank the acquired opinion terms against the generic opinion lexicon
        generated_opinion_lex_reranked = \
            self.rerank.predict(AcquireTerms.acquired_opinion_terms_path,
                                AcquireTerms.generic_opinion_lex_path)
        _write_opinion_lex(parsed_data, generated_opinion_lex_reranked,
                           LEXICONS_OUT)

        return generated_opinion_lex_reranked, generated_aspect_lex

    def parse_data(self, data: Union[PathLike, PosixPath],
                   parsed_dir: Union[PathLike, PosixPath]):
        _, data_size = parse_docs(self.parser, data, out_dir=parsed_dir)
        if data_size < 1000:
            raise ValueError(
                "The data contains only {0} sentences. A minimum of 1000 "
                "sentences is required for training.".format(data_size))
        return parsed_dir
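
A minimal usage sketch for the class above; the corpus path is a hypothetical
stand-in for any plain-text file with at least 1000 sentences (the minimum
enforced in parse_data). The first run downloads the default reranking model
and the GloVe embeddings:

# Hypothetical input path, not part of the original example
train = TrainSentiment(parse=True, rerank_model=None)
opinion_lex, aspect_lex = train.run(data="data/restaurant_reviews.txt")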