Example #1
def get_spacy_model(spacy_model_name: str,
                    pos_tags: bool,
                    parse: bool,
                    ner: bool,
                    with_custom_tokenizer: bool = False,
                    with_sentence_segmenter: bool = False) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly,
    we'll save references to them, keyed by the options
    we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner, with_custom_tokenizer, with_sentence_segmenter)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        if with_custom_tokenizer:
            spacy_model.tokenizer = combined_rule_tokenizer(spacy_model)
        if with_sentence_segmenter:
            spacy_model.add_pipe(combined_rule_sentence_segmenter, first=True)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
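
A minimal usage sketch (hypothetical model name; assumes LOADED_SPACY_MODELS is a module-level dict and SpacyModelType aliases spacy.language.Language, as in allennlp): repeated calls with the same options return the same cached instance.

nlp_a = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b  # second call hits the cache instead of reloading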
Example #2
    def train(self):
        with self.snapshot.training_lock():
            spacy_model_name = os.environ.get('NERD_SPACY_MODEL')
            with log_perf(f'{self.snapshot} TRAINING'):
                try:
                    self._nlp = spacy.load(spacy_model_name)
                except OSError:
                    logger.warning(
                        f"Spacy model '{spacy_model_name}' not found.  Downloading and installing."
                    )
                    from spacy.cli.download import download as spacy_download
                    spacy_download(spacy_model_name)
                    from spacy.cli import link
                    from spacy.util import get_package_path

                    package_path = get_package_path(spacy_model_name)
                    link(spacy_model_name,
                         spacy_model_name,
                         force=True,
                         package_path=package_path)
                    self._nlp = spacy.load(spacy_model_name)
                self._add_types()
                self._train_snapshot_texts()
                """ Only locking when saving to disk after training is done in memory """
            with log_perf(f'{self.snapshot} SAVING_TO_DISK'):
                if os.path.exists(self._path):
                    shutil.rmtree(self._path)
                self._nlp.to_disk(self._path)
Example #3
 def load_lang_model(lang: str, disable: List[str]):
     """Load spaCy language model or download if
         model is available and not installed
     
     Arguments:
         lang {str} -- language
         disable {List[str]} -- If only using tokenizer, can disable ['parser', 'ner', 'textcat']
     
     Returns:
         [type] -- [description]
     """
     if 'coref' in lang:
         try:
             return spacy.load(lang, disable=disable)
         except Exception:
             # Fall back to the base language code, e.g. 'en' from 'en_coref_md'.
             return SpacyAnnotator.load_lang_model(lang.split('_')[0],
                                                   disable=disable)
     try:
         return spacy.load(lang, disable=disable)
     except OSError:
         logger.warning(
             f"Spacy models '{lang}' not found.  Downloading and installing."
         )
         spacy_download(lang)
         # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
         # 2.1.0, which removed the linking that was done in spacy 2.0.  importlib doesn't find
         # packages that were installed in the same python session, so the way `spacy_download`
         # works in 2.1.0 is broken for this use case.  These four lines can probably be removed
         # at some point in the future, once spacy has figured out a better way to handle this.
         # See https://github.com/explosion/spaCy/issues/3435.
         from spacy.cli import link
         from spacy.util import get_package_path
         package_path = get_package_path(lang)
         link(lang, lang, model_path=package_path)
         return spacy.load(lang, disable=disable)
Example #4
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            # NOTE(mattg): The following four lines are a workaround suggested by Ines for spacy
            # 2.1.0, which removed the linking that was done in spacy 2.0.  importlib doesn't find
            # packages that were installed in the same python session, so the way `spacy_download`
            # works in 2.1.0 is broken for this use case.  These four lines can probably be removed
            # at some point in the future, once spacy has figured out a better way to handle this.
            # See https://github.com/explosion/spaCy/issues/3435.
            from spacy.cli import link
            from spacy.util import get_package_path
            package_path = get_package_path(spacy_model_name)
            link(spacy_model_name, spacy_model_name, model_path=package_path)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #5
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #6
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool,
                    ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)

            # Import the downloaded model module directly and load from there
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(
                disable=disable)  # type: ignore

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #7
 def __init__(
     self,
     model="en",
     disable=None,
     display_prompt=True,
     n_jobs=8,
     batch_size=1500,
     spacy_doc=False,
     show_tok=True,
     show_doc=True,
     ptb_pos=False,
 ):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = "https://spacy.io/models"
         if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         print("Spacy model installed, please rerun your command.")
         sys.exit(0)
     self.n_jobs = n_jobs
     self.batch_size = batch_size
     self.spacy_doc = spacy_doc
     self.show_tok = show_tok
     self.show_doc = show_doc
     self.ptb_pos = ptb_pos
Example #8
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool,
                    ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #9
def setup_model(model):
    try:
        nlp = spacy.load(model)
    except OSError:
        print(f"Spacy model '{model}' not found.  Downloading and installing.")
        spacy_download(model)
        nlp = spacy.load(model)
    return nlp
Example #10
def get_spacy_model(spacy_model_name='en_core_web_sm'):
    try:
        nlp = spacy.load(spacy_model_name)
    except OSError:
        log.info('The %s model was not found. Loading "en_core_web_sm"...', spacy_model_name)
        spacy_download('en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    return nlp
Example #11
def load_spacy(model_name):
    try:
        model = spacy.load(model_name)
    except OSError:
        print(f"Spacy models '{model_name}' not found.  Downloading and installing.")
        spacy_download(model_name)
        model = spacy.load(model_name)
    return model
Example #12
def setup_model():
    global NLP
    global MODEL
    try:
        NLP = spacy.load(MODEL)
    except OSError:
        print(
            f"Spacy models '{MODEL}' not found.  Downloading and installing.")
        spacy_download(MODEL)
        NLP = spacy.load(MODEL)
Example #13
def download_models():
    print('Downloading models...')
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    ["python", "-m", "spacy", "download", "en"]
    print('...done downloading.\nImporting and loading model.')
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    print(nlp)

    spacy_download('en')
Example #14
 def __init__(self, model="en", disable=None, display_prompt=True):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = "https://spacy.io/models"
         if display_prompt and license_prompt("Spacy {} model".format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #15
 def __init__(self, model='en', disable=None):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = 'https://spacy.io/models'
         if license_prompt('Spacy {} model'.format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #16
 def __init__(self, model='en', disable=None):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = 'https://spacy.io/models'
         if license_prompt('Spacy {} model'.format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #17
def spacy_downloader(spacy_model_name: str, pos_tags: bool, parse: bool,
                     ner: bool) -> SpacyModelType:
    '''
    This is a copy of the allennlp.common.util.get_spacy_model function. It in
    effect downloads the relevant spaCy model and loads the model with the
    relevant taggers, e.g. the POS, parse, and NER taggers for that spaCy
    model, which is language dependent.

    Spacy can have multiple trained models per language based on size.

    :param spacy_model_name: Name of the Spacy model e.g. en_core_web_sm
    :param pos_tags: Whether or not the returned Spacy model should perform 
                     POS tagging.
    :param parse: Whether or not the returned Spacy model should perform 
                  Parsing.
    :param ner: Whether or not the returned Spacy model should perform 
                  NER.
    :returns: The relevant Spacy model.
    '''

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        # This needs manually updating each time Spacy is updated. Supported
        # languages can be found here: https://spacy.io/usage/models
        supported_codes = [
            'de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pt', 'xx'
        ]
        lang_code = spacy_model_name[:2]
        if lang_code not in supported_codes:
            raise ValueError('Spacy does not support the following language '
                             f'{lang_code}. These languages are supported '
                             f'{supported_codes}')

        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy models '{spacy_model_name}' not found. "
                  "Downloading and installing.")
            spacy_download(spacy_model_name)
            from spacy.cli import link
            from spacy.util import get_package_path
            package_path = get_package_path(spacy_model_name)
            link(spacy_model_name, spacy_model_name, model_path=package_path)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #18
    def _prepare_tests():
        import spacy
        from spacy.cli.download import download as spacy_download
        try:
            spacy.load('en')
        except OSError:
            spacy_download('en')

        from nlp_architect.api.machine_comprehension_api import MachineComprehensionApi
        from nlp_architect.api.intent_extraction_api import IntentExtractionApi
        from nlp_architect.api.ner_api import NerApi
        NerApi(prompt=False)
        IntentExtractionApi(prompt=False)
        MachineComprehensionApi(prompt=False).download_model()
Example #19
def init():
    global inference
    spacy_download('en')
    aspect_lex_path = Model.get_model_path('c_aspect_lex')
    opinion_lex_path = Model.get_model_path('c_opinion_lex')
    print("%------------------------------------------%")
    print("aspect_lex_path: ", Path(aspect_lex_path))
    print("current wd: ", os.getcwd())
    path = Path(aspect_lex_path)
    print("pathlib-exists()---->", path.exists())
    print("Path :", path)
    print("Parent :", Path(aspect_lex_path).parent.parent.parent)
    print(os.listdir(Path(aspect_lex_path).parent.parent.parent))
    print("%-----------------------------------------%")
    inference = SentimentInference(aspect_lex_path, opinion_lex_path)
Example #20
def select_spacy_model(spacy_model_name: str) -> SpacyModelType:
    """
    This function checks whether there is already a loaded instance of the
    specified spaCy model. If there is, it returns that model; otherwise it
    loads the model. Loaded models are stored in LOADED_SPACY_MODELS.
    """
    if spacy_model_name not in LOADED_SPACY_MODELS:
        try:
            spacy_model = spacy.load(spacy_model_name, disable=["ner"])
        except OSError:
            print(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=["ner"])
        LOADED_SPACY_MODELS[spacy_model_name] = spacy_model
    return LOADED_SPACY_MODELS[spacy_model_name]
Example #21
import json
import re
from tqdm import tqdm
from nltk import flatten
from nlp_architect.models.absa.inference.inference import SentimentInference
from spacy.cli.download import download as spacy_download
import spacy
from spacy.lang.en import English

# load English language model
spacy_download('en')

# Construction via create_pipe
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

# Custom func

def word_freq(word_list):
    """
    Return a dict mapping each word in word_list to its frequency.
    """
    word_freq = [word_list.count(w) for w in word_list]
    return dict(zip(word_list, word_freq))
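
A quick illustration of what word_freq produces (input is hypothetical; duplicate words collapse because dict keys are unique):

tokens = ["good", "bad", "good"]
print(word_freq(tokens))  # {'good': 2, 'bad': 1}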
Example #22
    def __init__(
        self,
        mode='all',
        config_file='multiwoz_all_context.json',
        model_file='https://convlab.blob.core.windows.net/convlab-2/bert_multiwoz_all_context.zip'
    ):
        assert mode == 'usr' or mode == 'sys' or mode == 'all'
        self.mode = mode
        config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   'configs/{}'.format(config_file))
        config = json.load(open(config_file))
        # print(config['DEVICE'])
        # DEVICE = config['DEVICE']
        DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda:0'
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        data_dir = os.path.join(root_dir, config['data_dir'])
        output_dir = os.path.join(root_dir, config['output_dir'])

        if not os.path.exists(os.path.join(data_dir, 'intent_vocab.json')):
            preprocess(mode)

        intent_vocab = json.load(
            open(os.path.join(data_dir, 'intent_vocab.json')))
        tag_vocab = json.load(open(os.path.join(data_dir, 'tag_vocab.json')))
        dataloader = Dataloader(
            intent_vocab=intent_vocab,
            tag_vocab=tag_vocab,
            pretrained_weights=config['model']['pretrained_weights'])

        print('intent num:', len(intent_vocab))
        print('tag num:', len(tag_vocab))

        best_model_path = os.path.join(output_dir, 'pytorch_model.bin')
        if not os.path.exists(best_model_path):
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print('Load from model_file param')
            archive_file = cached_path(model_file)
            archive = zipfile.ZipFile(archive_file, 'r')
            archive.extractall(root_dir)
            archive.close()
        print('Load from', best_model_path)
        model = JointBERT(config['model'], DEVICE, dataloader.tag_dim,
                          dataloader.intent_dim)
        model.load_state_dict(
            torch.load(os.path.join(output_dir, 'pytorch_model.bin'), DEVICE))
        model.to(DEVICE)
        model.eval()

        self.model = model
        self.use_context = config['model']['context']
        self.dataloader = dataloader
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception:
            print('download en_core_web_sm for spacy')
            from spacy.cli.download import download as spacy_download
            spacy_download("en_core_web_sm")
            spacy_model_module = __import__("en_core_web_sm")
            self.nlp = spacy_model_module.load()
        with open(
                os.path.join(get_root_path(),
                             'data/multiwoz/db/postcode.json'), 'r') as f:
            token_list = json.load(f)

        for token in token_list:
            token = token.strip()
            self.nlp.tokenizer.add_special_case(token, [{
                ORTH: token,
                LEMMA: token,
                POS: u'NOUN'
            }])
        print("BERTNLU loaded")
Example #23
def get_spacy_model(
    spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool
) -> SpacyModelType:

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)

            # Import the downloaded model module directly and load from there
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]


@contextmanager
def pushd(new_dir: PathType, verbose: bool = False) -> ContextManagerFunctionReturnType[None]:
    previous_dir = os.getcwd()
    if verbose:
        logger.info(f"Changing directory to {new_dir}")
    os.chdir(new_dir)
    try:
        yield
    finally:
        if verbose:
            logger.info(f"Changing directory back to {previous_dir}")
        os.chdir(previous_dir)


@contextmanager
def push_python_path(path: PathType) -> ContextManagerFunctionReturnType[None]:
    path = Path(path).resolve()
    path = str(path)
    sys.path.insert(0, path)
    try:
        yield
    finally:
        # Remove by value, in case `sys.path` was manipulated in between.
        sys.path.remove(path)


def import_module_and_submodules(package_name: str) -> None:
    importlib.invalidate_caches()

    with push_python_path("."):
        # Import at top level
        module = importlib.import_module(package_name)
        path = getattr(module, "__path__", [])
        path_string = "" if not path else path[0]

        for module_finder, name, _ in pkgutil.walk_packages(path):
            if path_string and module_finder.path != path_string:
                continue
            subpackage = f"{package_name}.{name}"
            import_module_and_submodules(subpackage)


def peak_memory_mb() -> Dict[int, float]:
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_mb = peak / 1_000_000
        else:
            # On Linux the result is in kilobytes.
            peak_mb = peak / 1_000

    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        peak_mb_tensor = torch.tensor([float(global_rank), peak_mb])
        # All the workers' results will be gathered into this list.
        gather_results = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

        # If the backend is 'nccl', we are training on GPUs, so the tensors must be on GPU too.
        if dist.get_backend() == "nccl":
            peak_mb_tensor = peak_mb_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_mb_tensor)

        results_dict: Dict[int, float] = {}
        for peak_mb_tensor in gather_results:
            worker = int(peak_mb_tensor[0])
            peak_mb = round(float(peak_mb_tensor[1]), 3)
            results_dict[worker] = peak_mb

        return results_dict
    else:
        return {0: peak_mb}


def gpu_memory_mb() -> Dict[int, int]:
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
            encoding="utf-8",
        )
        gpu_memory = [int(x) for x in result.strip().split("\n")]
        return {gpu: memory for gpu, memory in enumerate(gpu_memory)}
    except FileNotFoundError:
        return {}
    except:  # noqa
        logger.warning(
            "unable to check gpu_memory_mb() due to occasional failure, continuing", exc_info=True
        )
        return {}


def ensure_list(iterable: Iterable[A]) -> List[A]:
    if isinstance(iterable, list):
        return iterable
    else:
        return list(iterable)


def is_lazy(iterable: Iterable[A]) -> bool:
    return not isinstance(iterable, list)


def int_to_device(device: Union[int, torch.device]) -> torch.device:
    if isinstance(device, torch.device):
        return device
    if device < 0:
        return torch.device("cpu")
    return torch.device(device)


def log_frozen_and_tunable_parameter_names(model: torch.nn.Module) -> None:
    frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(model)

    logger.info("The following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)

    logger.info("The following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)


def get_frozen_and_tunable_parameter_names(
    model: torch.nn.Module,
) -> Tuple[Iterable[str], Iterable[str]]:
    frozen_parameter_names = (
        name for name, parameter in model.named_parameters() if not parameter.requires_grad
    )
    tunable_parameter_names = (
        name for name, parameter in model.named_parameters() if parameter.requires_grad
    )
    return frozen_parameter_names, tunable_parameter_names


def dump_metrics(file_path: Optional[str], metrics: Dict[str, Any], log: bool = False) -> None:
    metrics_json = json.dumps(metrics, indent=2)
    if file_path:
        with open(file_path, "w") as metrics_file:
            metrics_file.write(metrics_json)
    if log:
        logger.info("Metrics: %s", metrics_json)


def flatten_filename(file_path: str) -> str:
    return file_path.replace("/", "_SLASH_")


def is_master(
    global_rank: int = None, world_size: int = None, num_procs_per_node: int = None
) -> bool:

    if not is_distributed():
        return True

    if global_rank is None:
        global_rank = dist.get_rank()

    if world_size is None:
        world_size = dist.get_world_size()

    if num_procs_per_node is None and os.environ:
        num_procs_per_node = int(os.environ.get("ALLENNLP_PROCS_PER_NODE", world_size))

    return global_rank % (world_size / num_procs_per_node) == 0


def is_distributed() -> bool:
    return dist.is_available() and dist.is_initialized()


def sanitize_wordpiece(wordpiece: str) -> str:
    if wordpiece.startswith("##"):
        return wordpiece[2:]
    elif wordpiece.startswith("Ġ"):
        return wordpiece[1:]
    elif wordpiece.startswith("▁"):
        return wordpiece[1:]
    else:
        return wordpiece
Example #24
import math
import logging
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import nltk
from nltk.tokenize import sent_tokenize
from spacy.cli.download import download as spacy_download
try:
    import en_core_web_sm
except ImportError:
    logging.warning(">Spacy  en_core_web_sm not found. Downloading and installing.")
    spacy_download("en_core_web_sm")
    import en_core_web_sm
from collections import Counter, defaultdict, OrderedDict
import time
import os
from enum import Enum, auto


class ParseAndModel:
    """
    Handles the data input chain and, based on this data, computes matrices
    for reviews and features.
    Usage:
        pm = ParseAndModel(feature_list=["sound", "battery", ["screen", "display"]],
                       filename='../tests/data/parse_and_model/iPod.final')
        print(pm.model_results)
    """

    class InputType(Enum):
Example #25
import nltk
import spacy
from nltk.tokenize import word_tokenize

try:
    # if it is not on the machine, download it
    import pt_core_news_sm  # noqa
except:  # noqa
    from spacy.cli.download import download as spacy_download

    spacy_download("pt_core_news_sm")
nltk.download("punkt")
sp = spacy.load("pt_core_news_sm")


def remove_portuguese_stopwords(text, custom_stopwords=None):
    text = text.lower()
    all_stopwords = sp.Defaults.stop_words
    abc = [char for char in "abcdefghijklmnopqrstuvxyzw"]
    if not custom_stopwords:
        custom_stopwords = []
    aditional_stopwords = list(all_stopwords) + abc + custom_stopwords

    text_tokens = word_tokenize(text)
    return " ".join(
        [word for word in text_tokens if word not in aditional_stopwords])
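
A hedged usage sketch (the Portuguese sentence is illustrative; exact output depends on spaCy's pt stop-word list):

print(remove_portuguese_stopwords("o carro é muito rápido", custom_stopwords=["carro"]))
# stop words ("o", "é", "muito"), single letters, and the custom word "carro" are removed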
Example #26
def default_nlp_model():  # pragma: no cover
    spacy_download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')
    return nlp