def __init__(
    self,
    stopwords_folder_path: AnyStr = None,
    use_models: bool = False,
    hashtags_as_token: bool = True,
    batch_size: int = DEFAULT_BATCH_SIZE,
    max_num_characters: int = MAX_NUM_CHARACTERS,
):
    """Set up a MultilingualTokenizer; every argument is optional.

    Args:
        stopwords_folder_path (str, optional): Folder holding stopword text files
            (one stopword per line). Each file is named "{language_code}.txt",
            with the language code in ISO 639-1 format.
        use_models (bool): When True, spaCy models are loaded, which is slower but
            makes Part-of-Speech and Entities tags available to downstream tasks.
            Default is False.
        hashtags_as_token (bool): Keep a hashtag as a single token rather than two.
            Default is True, which overrides the spaCy default behavior.
        batch_size (int): Number of documents processed at once in spaCy pipelines.
            Default comes from the DEFAULT_BATCH_SIZE constant.
        max_num_characters (int): Upper bound on the number of characters in one text.
            Default comes from MAX_NUM_CHARACTERS (documented as 10 million, above
            spaCy's more conservative 1 million).
    """
    # Save every constructor argument as a same-named attribute.
    store_attr()
    # Placeholder until tokenize_df assigns the real column name.
    self.tokenized_column = None
    # Starts empty and is filled elsewhere in the class.
    self.spacy_nlp_dict = {}
def __init__(self, emailer, messenger, items, confirms=1):
    """Shared state for every scraper.

    Args:
        emailer (Emailer): emailer to use for alerts
        messenger (Messenger): messenger to use for alerts
        items (list): list of item descriptions; each one is indexed by
            "name" and "exclude" below
        confirms (int): number of repeating states for a state change
    """
    super().__init__()
    # Saves emailer, messenger, items and confirms as attributes.
    store_attr()

    # Short identifier: the trailing 12 hex digits of a random UUID.
    self.id = uuid4().hex[-12:]

    browser_options = Options()
    browser_options.headless = True
    browser_options.add_argument("start-maximized")
    self.options = browser_options
    # self.profile = FirefoxProfile()
    # self.profile.set_preference("dom.disable_beforeunload", True)
    # self.profile.set_preference("browser.tabs.warnOnClose", False)

    # Both are created later, outside of the constructor.
    self.driver = None
    self.waiter = None

    # One entry per item, with a pending-state buffer of `confirms` slots.
    stock_state = {}
    for item in self.items:
        item_name = item["name"]
        stock_state[item_name] = {
            "current_state": None,
            "pending_state": [None] * self.confirms,
            "excluded": item["exclude"],
        }
    self.stock_state = stock_state
def __init__(
    self,
    tokenizer: MultilingualTokenizer,
    token_filters: Set[AnyStr],
    lemmatization: bool = True,
    lowercase: bool = True,
    unicode_normalization: UnicodeNormalization = UnicodeNormalization.NONE,
    keep_filtered_tokens: bool = False,
):
    """Initialization method for the TextCleaner class, with optional arguments

    Args:
        tokenizer (MultilingualTokenizer): Tokenizer instance to handle the initial tokenization step
        token_filters (set): Set of spaCy token attributes to filter out
            Available token filters are defined in MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES
        lemmatization (bool, optional): If True, lemmatize tokens using spaCy lookups data
            Default is True, which simplifies all tokens to their lemma e.g. going -> go, mice -> mouse
        lowercase (bool, optional): If True, convert everything to lowercase after filter and lemmatization steps
            Default is True
        unicode_normalization (UnicodeNormalization, optional): Unicode normalization method (final post-processing)
            Default is not to apply normalization. Beware that it's a more complex topic than it looks.
            Read https://en.wikipedia.org/wiki/Unicode_equivalence if you want to understand more
            TL;DR: human languages are a mess => Unicode is a mess too
        keep_filtered_tokens (bool): If True, store filtered tokens in additional columns in the output dataframe
            Default is False, adding only 1 column, which is the cleaned version of the original text
    """
    # Keeps all constructor arguments on the instance
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
    # Work on a per-instance copy so the class-level OUTPUT_COLUMN_DESCRIPTIONS
    # is not the object that gets modified later.
    self.output_column_descriptions = (
        self.OUTPUT_COLUMN_DESCRIPTIONS.copy()
    )  # will be changed by `_prepare_df_for_cleaning`
def __init__(
    self,
    tokenizer: MultilingualTokenizer,
    text_column: AnyStr,
    font_folder_path: AnyStr,
    language: AnyStr = "en",
    language_column: AnyStr = None,
    subchart_column: AnyStr = None,
    max_words: int = DEFAULT_MAX_WORDS,
    color_list: List = DEFAULT_COLOR_LIST,
    font: str = DEFAULT_FONT,
    scale: float = DEFAULT_SCALE,
    margin: float = DEFAULT_MARGIN,
    random_state: int = DEFAULT_RANDOM_STATE,
    figsize: tuple = DEFAULT_FIGSIZE,
    dpi: int = DEFAULT_DPI,
    titlepad: int = DEFAULT_TITLEPAD,
    titlesize: int = DEFAULT_TITLESIZE,
    pad_inches: int = DEFAULT_PAD_INCHES,
    bbox_inches: str = DEFAULT_BBOX_INCHES,
    background_color: str = DEFAULT_BACKGROUND_COLOR,
):
    """Initialize the WordcloudVisualizer.

    All arguments are saved as same-named attributes. The global `random`
    module is then seeded with `random_state`, and `language_as_subchart`
    records whether the language column and the subchart column are equal.
    """
    store_attr()
    random.seed(self.random_state)
    # Must be evaluated before the special case below resets subchart_column.
    self.language_as_subchart = self.language_column == self.subchart_column

    # Special value "order66": switch the font and disable sub-charts.
    easter_egg_requested = self.subchart_column == "order66"
    if easter_egg_requested:
        self.subchart_column = None
        self.font = "DeathStar.otf"
def __init__(
    self,
    ontology_df: pd.DataFrame,
    tag_column: AnyStr,
    category_column: AnyStr,
    keyword_column: AnyStr,
    language: AnyStr,
    lemmatization: bool = False,
    ignore_case: bool = False,
    ignore_diacritics: bool = False,
):
    """Store the ontology settings, clean the ontology and build the tokenizer.

    All arguments are saved as same-named attributes. Incomplete ontology rows
    are then removed, missing categories are replaced when a category column
    is set, and a sentencizer-only MultilingualTokenizer is created.
    """
    store_attr()
    self._remove_incomplete_rows()
    if self.category_column:
        self._replace_missing_categories()

    # Punctuation characters used for sentence splitting: the Sentencizer
    # defaults plus the newline character.
    sentence_punct_chars = Sentencizer.default_punct_chars + ["\n"]
    sentencizer_config = {"sentencizer": {"punct_chars": sentence_punct_chars}}
    self.tokenizer = MultilingualTokenizer(
        add_pipe_components=["sentencizer"],
        enable_pipe_components="sentencizer",
        config=sentencizer_config,
    )

    # Dictionary of spaCy PhraseMatcher objects filled by the _match_no_category method.
    # Unused when EntityRuler is used instead (i.e. when the ontology has categories).
    self._matcher_dict = {}

    # New dataframe columns (key) and their descriptions (value).
    # Filled by the _format_with_category / _format_no_category methods.
    self.column_descriptions = {}

    # Text is normalized with NFC when this is truthy, with NFD otherwise.
    self._use_nfc = self.lemmatization and not self.ignore_diacritics

    self._keyword_to_tag = {}
def __init__(self, items, tfms, use_list=None, do_setup=True, split_idx=None, train_setup=True,
             splits=None, types=None, verbose=False, dl_type=None):
    """Wrap `items` with a PipelineX built from `tfms`, optionally running setup.

    `tfms` may be a TfmdListsX (its own `tfms` is reused) or a PipelineX; when
    it is, or unwraps to, a PipelineX, setup is skipped. `splits` defaults to
    one split covering everything plus an empty split, and is passed through
    `mask2idxs`.
    """
    super().__init__(items, use_list=use_list)

    if dl_type is not None:
        self._dl_type = dl_type  # potentially unused

    if splits is None:
        split_spec = [slice(None), []]
    else:
        split_spec = splits
    self.splits = L(split_spec).map(mask2idxs)

    # Unwrap an existing TfmdListsX first, then check for a ready-made pipeline.
    if isinstance(tfms, TfmdListsX):
        tfms = tfms.tfms
    if isinstance(tfms, PipelineX):
        # Setup is not repeated for an existing PipelineX.
        do_setup = False

    self.tfms = PipelineX(tfms, split_idx=split_idx)
    store_attr('types,split_idx')

    if not do_setup:
        return
    pv(f"Setting up {self.tfms}", verbose)
    self.setup(train_setup=train_setup)
def __init__(
    self,
    api_wrapper: GoogleCloudVisionAPIWrapper,
    input_folder: dataiku.Folder,
    input_df: pd.DataFrame,
    column_prefix: AnyStr = "api",
    input_folder_is_gcs: bool = False,
    input_folder_bucket: AnyStr = "",
    input_folder_root_path: AnyStr = "",
    output_dataset: dataiku.Dataset = None,
    output_folder: dataiku.Folder = None,
    output_folder_is_gcs: bool = False,
    output_folder_bucket: AnyStr = "",
    output_folder_root_path: AnyStr = "",
    api_quota_rate_limit: int = 1800,
    api_quota_period: int = 60,
    batch_support: bool = False,
    batch_size: int = 4,
    parallel_workers: int = 4,
    error_handling: ErrorHandling = ErrorHandling.LOG,
    features: List[Dict] = None,
    max_results: int = 10,
    image_context: Dict = None,
    minimum_score: float = 0.0,
    content_categories: List[vision.Feature.Type] = None,
    unsafe_content_categories: List[UnsafeContentCategory] = None,
    **kwargs,
):
    """Store the configuration of a Google Cloud Vision API run on the instance.

    Every named argument is saved as a same-named attribute. The container
    arguments use None as their default and are replaced by a fresh object
    per instance, so that instances never share (and accidentally mutate) a
    single default list or dict.

    Args:
        features: Defaults to a new `[{}]` when None.
        image_context: Defaults to a new `{}` when None.
        content_categories: Defaults to a new `[]` when None.
        unsafe_content_categories: Defaults to a new `[]` when None.
        **kwargs: Accepted for call-site compatibility; not referenced in
            this method.
    """
    # Rebind the locals *before* store_attr() so the normalized values are
    # the ones saved on the instance.
    if features is None:
        features = [{}]
    if image_context is None:
        image_context = {}
    if content_categories is None:
        content_categories = []
    if unsafe_content_categories is None:
        unsafe_content_categories = []
    store_attr()
def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token=1, hidden_p=0.2, input_p=0.6,
             embed_p=0.1, weight_p=0.5, bidir=False):
    """Build the embedding, the stacked RNN layers and their dropout modules.

    The first RNN layer takes `emb_sz` inputs and the last one produces
    `emb_sz` outputs; every other size is `n_hid`. Output sizes are divided by
    the number of directions (2 when `bidir`, otherwise 1).
    """
    store_attr('emb_sz,n_hid,n_layers,pad_token')
    self.bs = 1
    if bidir:
        self.n_dir = 2
    else:
        self.n_dir = 1

    self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
    self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)

    # Layers are created in index order, exactly as a comprehension would.
    final_idx = n_layers - 1
    rnn_layers = []
    for layer_idx in range(n_layers):
        n_in = emb_sz if layer_idx == 0 else n_hid
        n_out = (n_hid if layer_idx != final_idx else emb_sz) // self.n_dir
        rnn_layers.append(self._one_rnn(n_in, n_out, bidir, weight_p, layer_idx))
    self.rnns = nn.ModuleList(rnn_layers)

    # Embedding weights are re-drawn uniformly in [-initrange, initrange].
    self.encoder.weight.data.uniform_(-self.initrange, self.initrange)

    self.input_dp = RNNDropout(input_p)
    hidden_dropouts = [RNNDropout(hidden_p) for _ in range(n_layers)]
    self.hidden_dps = nn.ModuleList(hidden_dropouts)
    self.reset()
def __init__(self, max_retries=3):
    """Determine email timing values.

    Args:
        max_retries (int): maximum action attempts before terminating
    """
    # Keeps max_retries on the instance
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
def __init__(
    self,
    input_table: str,
    mv_output: str,
    taxa_output: str,
    core_model: str = "en_core_sci_sm",
) -> None:
    """Record the input/output locations and the model name on the instance.

    Args:
        input_table (str): stored as `self.input_table`
        mv_output (str): stored as `self.mv_output`
        taxa_output (str): stored as `self.taxa_output`
        core_model (str): stored as `self.core_model`; presumably the name of
            a spaCy/scispaCy model -- confirm where it is loaded
    """
    # The comma-separated names select which arguments are saved as attributes.
    store_attr("input_table, mv_output, taxa_output, core_model")
def __init__(self,
             email: str,
             archive_paths: str,
             extracted_output: str,
             query: str = 'mud[TIAB] AND volcano[TIAB]') -> None:
    """Store the contact e-mail and query, and resolve the archive/output paths.

    Args:
        email (str): saved as `self.email`
        archive_paths (str): root directory searched recursively for `.gz` files
        extracted_output (str): output location, kept as a `Path`
        query (str): saved as `self.query`
    """
    store_attr('email, query')
    archive_root = Path(archive_paths)
    # Lazy, single-pass iterator over every *.gz file below the root.
    self.archive_paths = archive_root.rglob('*.gz')
    self.extracted_output = Path(extracted_output)
def __init__(self, in_channels=1, n_classes=2, stride=1, inplanes=64, pre_ssl=True, **kwargs):
    """Assemble a ResNeXt-50 encoder with an ASPP bottleneck, U-Net decoder and FPN head.

    Args:
        in_channels (int): number of input channels; the first convolution is
            cut down (fewer than 3) or replaced (more than 3) accordingly
        n_classes (int): number of output channels of the final convolution
        stride (int): multiplier applied to the ASPP dilations 1, 2, 3 and 4
        inplanes (int): output channels of a replaced first convolution
        pre_ssl (bool): load the `resnext50_32x4d_ssl` backbone through
            torch.hub when True, otherwise build a fresh ResNet
        **kwargs: accepted but not referenced in this method
    """
    super().__init__()
    store_attr('in_channels, n_classes, inplanes, pre_ssl')

    # --- encoder backbone ---
    if pre_ssl:
        backbone = torch.hub.load(
            'facebookresearch/semi-supervised-ImageNet1K-models',
            'resnext50_32x4d_ssl')
    else:
        backbone = ResNet(Bottleneck, [3, 4, 6, 3], groups=32, width_per_group=4)
    backbone.conv1.padding = (0, 0)

    # Adapt the first convolution to the requested number of input channels.
    if in_channels < 3:
        # Keep only the first `in_channels` input slices of the existing weights.
        with torch.no_grad():
            trimmed_weight = backbone.conv1.weight[:, :in_channels, ...]
            backbone.conv1.weight = nn.Parameter(trimmed_weight)
    elif in_channels > 3:
        backbone.conv1 = nn.Conv2d(in_channels,
                                   self.inplanes,
                                   kernel_size=7,
                                   stride=2,
                                   bias=False)

    # Encoder stages (channel counts as noted by the original author).
    self.enc0 = nn.Sequential(backbone.conv1, backbone.bn1, nn.ReLU(inplace=True))
    stem_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1)
    self.enc1 = nn.Sequential(stem_pool, backbone.layer1)  # 256
    self.enc2 = backbone.layer2  # 512
    self.enc3 = backbone.layer3  # 1024
    self.enc4 = backbone.layer4  # 2048

    # --- ASPP with customized dilations ---
    aspp_dilations = [stride * factor for factor in (1, 2, 3, 4)]
    self.aspp = ASPP(2048, 256, out_c=512, dilations=aspp_dilations)
    self.drop_aspp = nn.Dropout2d(0.5)

    # --- decoder ---
    self.dec4 = UnetBlock(512, 1024, 256, padding=0)
    self.dec3 = UnetBlock(256, 512, 128, padding=0)
    self.dec2 = UnetBlock(128, 256, 64, padding=0)
    self.dec1 = UnetBlock(64, 64, 32, padding=0)
    self.fpn = FPN([512, 256, 128, 64], [16] * 4)
    self.drop = nn.Dropout2d(0.1)
    self.final_conv = ConvLayer(32 + 16 * 4,
                                n_classes,
                                ks=1,
                                norm_type=None,
                                act_cls=None)
def __init__(
    self,
    text_df,
    text_column,
    tokenizer,
    language,
    language_column=None,
):
    """Keep the text dataframe, its column names, the tokenizer and the language.

    Args:
        text_df: saved as `self.text_df` (presumably a pandas DataFrame -- confirm)
        text_column: saved as `self.text_column`
        tokenizer: saved as `self.tokenizer`
        language: saved as `self.language`
        language_column: saved as `self.language_column`; None by default
    """
    # Saves every argument as a same-named attribute
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
def __init__(self, path = Path('.'), data_fn='data', model_fn='model', data_func=None, bs=16, cpu=False, onnx=False):
    """Load the exported data description and model needed for inference.

    Args:
        path: directory passed to `load_data` and `load_model`
        data_fn: name handed to `load_data`
        model_fn: name handed to `load_model`
        data_func: saved as `self.data_func`
        bs: saved as `self.bs`
        cpu: when truthy the device is 'cpu', otherwise 'cuda'
        onnx: forwarded to `load_model`
    """
    data = load_data(path, data_fn)
    self.n_inp = data['n_inp']

    self.pipelines = make_pipelines(data)
    self.after_item = self.pipelines['after_item']
    self.after_batch = self.pipelines['after_batch']
    self.tfm_y = generate_pipeline(data['tfms'], order=False)

    self.model = load_model(path, model_fn, cpu, onnx)
    self.device = 'cpu' if cpu else 'cuda'

    # Assigned explicitly: `store_attr(self, 'data_func,bs')` relied on a
    # legacy positional signature; with the `store_attr(names, ...)` form the
    # instance would be taken for the list of names.
    self.data_func = data_func
    self.bs = bs
    self.decode_func = None
def __init__(
    self,
    gcp_service_account_key: AnyStr = None,
    gcp_continent: AnyStr = None,
    api_quota_period: int = 60,
    api_quota_rate_limit: int = 1800,
):
    """Store the credentials and quota settings, then build the client and API callers.

    Args:
        gcp_service_account_key (str, optional): saved as `self.gcp_service_account_key`
        gcp_continent (str, optional): saved as `self.gcp_continent`
        api_quota_period (int): saved as `self.api_quota_period`
            (presumably seconds -- confirm)
        api_quota_rate_limit (int): saved as `self.api_quota_rate_limit`
    """
    # Arguments are saved first; the helpers below are instance methods and
    # may read them (NOTE(review): their bodies are not visible here).
    store_attr()
    self.client = self.get_client()
    self.call_api_annotate_image = self._build_call_api_annotate_image()
    self.call_api_document_text_detection = self._build_call_api_document_text_detection()
def __init__(
    self,
    stopwords_folder_path: Optional[AnyStr] = None,
    use_models: bool = False,
    hashtags_as_token: bool = True,
    batch_size: int = DEFAULT_BATCH_SIZE,
    max_num_characters: int = MAX_NUM_CHARACTERS,
    add_pipe_components: Optional[List[str]] = None,
    enable_pipe_components: Optional[Union[List[str], str]] = None,
    disable_pipe_components: Optional[Union[List[str], str]] = None,
    config: Optional[dict] = None,
):
    """Initialization method for the MultilingualTokenizer class, with optional arguments

    Args:
        stopwords_folder_path (str, optional): Path to a folder with stopword text files (one line per stopword).
            Files should be named "{language_code}.txt" with the code in ISO 639-1 format.
        use_models (bool): If True, loads spaCy models, which is slower but allows to retrieve
            Part-of-Speech and Entities tags for downstream tasks. Default is False.
        hashtags_as_token (bool): Treat hashtags as one token instead of two.
            Default is True, which overrides the spaCy default behavior.
        batch_size (int): Number of documents to process in spaCy pipelines.
            Default is set by the DEFAULT_BATCH_SIZE class constant.
        max_num_characters (int): Maximum number of characters in a single text.
            Default is 10 million, higher than spaCy more conservative default at 1 million.
        add_pipe_components (list, optional): List of spaCy pipeline components to add, for instance "sentencizer".
            If use_models is False, only the tokenizer component is present so other components must be added
            explicitly. If use_models is True, several pipeline components are automatically added.
            Please refer to the spaCy documentation to know which components are available for each model.
            None (the default) is treated as an empty list.
        enable_pipe_components (list, optional): List of spaCy pipeline components to enable.
            To enable components, they must be added first, either by activating use_models
            or by adding them explicitly in add_pipe_components.
        disable_pipe_components (list, optional): List of spaCy pipeline components to disable.
            To disable components, they must be added first, either by activating use_models
            or by adding them explicitly in add_pipe_components.
            Please use either enable_pipe_components or disable_pipe_components,
            as both cannot be used at the same time.
        config (dict, optional): Dictionary of spaCy component (key) and its associated
            spaCy Language config dictionary (value), holding metadata about the component.
            None (the default) is treated as an empty dictionary, in which case the spaCy
            default config is used, describing the default values of the factory arguments.

    Raises:
        ValueError: If both enable_pipe_components and disable_pipe_components are given.
    """
    # Fail fast, before any state is stored on the instance.
    if enable_pipe_components and disable_pipe_components:
        raise ValueError(
            "Only one of enable_pipe_components and disable_pipe_components can be specified at once."
        )

    # Fresh containers per instance: a literal [] / {} default would be one
    # shared object across every tokenizer. Rebinding happens before
    # store_attr() so the normalized values are the ones saved.
    if add_pipe_components is None:
        add_pipe_components = []
    if config is None:
        config = {}

    store_attr()
    self.spacy_nlp_dict = {}
    self.tokenized_column = None  # may be changed by tokenize_df

    # spacy.language.DisabledPipes objects, initialized in create_spacy_tokenizer().
    # Holds, per language, the components that were disabled by the spaCy
    # Language.select_pipes() method. They can be put back at their initial
    # place in the pipeline by calling _restore_pipe_components[language].restore()
    self._restore_pipe_components = {}
def __init__(self, tok, rules=None, counter=None, lengths=None, mode=None, sep=' '):
    """Store the tokenizer settings and pick the text-processing rules.

    `tok` may be a class, in which case it is instantiated with no arguments
    before being stored. `rules` falls back to `defaults.text_proc_rules`
    when None.
    """
    tok_is_class = isinstance(tok, type)
    if tok_is_class:
        tok = tok()
    # Called after the rebinding above, so the instance is what gets stored.
    store_attr('tok,counter,lengths,mode,sep')

    if rules is None:
        rules = defaults.text_proc_rules
    self.rules = rules

    # NOTE(review): these look like leftover debug output -- confirm whether
    # they are still wanted.
    print(self.rules)
    print(tok)
def __init__(self, sender, account_id, auth_token):
    """Sends SMS.

    Args:
        sender (str): sender number
        account_id (str): twilio account id
        auth_token (str): twilio auth token
    """
    # Keeps sender, account_id and auth_token on the instance
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
    # Client built from the account id and auth token given above.
    self.client = Client(account_id, auth_token)
def __init__(self, emailer_configs, messenger_configs, database):
    """Factory that creates the concrete scrapers.

    Args:
        emailer_configs (dict): configs for email sender
        messenger_configs (dict): configs for sms sender
        database (Database): database of items and subscribers
    """
    store_attr()
    # Snapshot of the direct subclasses of Scraper known at construction time.
    self.scrapers_classes = list(Scraper.__subclasses__())
def __init__(self, server, port, sender, sender_pass, recipient=None):
    """Sends emails.

    Args:
        server (str): email server
        port (str): email server port
        sender (str): sender email address
        sender_pass (str): sender email password
        recipient (list): default email recipients
    """
    super().__init__()
    # Keeps every argument on the instance
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
def __init__(self, items_db_file, subs_db_file):
    """Load the item and subscription databases from disk.

    Args:
        items_db_file (str): file path for database of items
        subs_db_file (str): file path for database of subscribers
    """
    # Saves both paths as attributes; they are read back just below.
    store_attr()

    with open(file=self.items_db_file, mode="r") as items_handle:
        self.items_db = load(fp=items_handle)

    with open(file=self.subs_db_file, mode="r") as subs_handle:
        self.subs_db = load(fp=subs_handle)
def __init__(self, site_load_time=5, poll_time=2, max_refreshes=10, max_wait_time=5):
    """Determine scrape timing values.

    Args:
        site_load_time (int): wait time for a site to load before scraping
        poll_time (int): wait time between scraping a site
        max_refreshes (int): maximum site refreshes before reconnecting
        max_wait_time (int): maximum wait time for a site element to be found
            during scraping
    """
    # Keeps all four values on the instance
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
def __init__(
    self,
    language_scope: List = SUPPORTED_LANGUAGES_PYCLD3.keys(),
    minimum_score: float = 0.0,
    fallback_language: AnyStr = "",
):
    """Store the detection settings and prepare the langid identifier.

    The langid identifier is restricted to the languages of `language_scope`
    that are not listed in SUPPORTED_LANGUAGES_PYCLD3_NOT_LANGID.
    """
    store_attr()
    # Per-instance copy; may be changed by detect_languages_df.
    self.column_descriptions = self.COLUMN_DESCRIPTIONS.copy()

    self._langid_identifier = LanguageIdentifier.from_modelstring(
        model, norm_probs=True)
    langid_languages = [
        language_code
        for language_code in self.language_scope
        if language_code not in SUPPORTED_LANGUAGES_PYCLD3_NOT_LANGID
    ]
    self._langid_identifier.set_languages(langid_languages)
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandling = ErrorHandling.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    **kwargs,
):
    """Store the formatter settings and derive the API column names and descriptions.

    Column names are generated by `build_unique_column_names` from the input
    dataframe's keys and `column_prefix`. Each generated name is then mapped
    to its description, and a description for PATH_COLUMN is added.
    """
    store_attr()
    self.output_df = None  # initialization before calling format_df
    self.api_column_names = build_unique_column_names(
        input_df.keys(), column_prefix)

    descriptions = {}
    for key, column_name in self.api_column_names._asdict().items():
        descriptions[column_name] = API_COLUMN_NAMES_DESCRIPTION_DICT[key]
    descriptions[PATH_COLUMN] = "Path of the file relative to the input folder"
    self.column_description_dict = descriptions
def __init__(
    self,
    tokenizer: MultilingualTokenizer,
    dictionary_folder_path: AnyStr,
    custom_vocabulary_set: Set[AnyStr] = None,
    custom_corrections: Dict = None,
    edit_distance: int = DEFAULT_EDIT_DISTANCE,
    ignore_token: Pattern = None,
    transfer_casing: bool = True,
    compute_diagnosis: bool = True,
):
    """Initialization method for the SpellChecker class, with optional arguments

    Args:
        tokenizer: MultilingualTokenizer instance, stored as `self.tokenizer`
        dictionary_folder_path: Local path to a folder containing SymSpell dictionary files
            Each dictionary file in the folder should be named "xx.txt"
            where xx is the language code in ISO 639-1 format
        custom_vocabulary_set: Optional - Set of words that should not be corrected
            None (the default) is treated as an empty set
        custom_corrections: Optional - Dictionary of words (key) and their custom correction (value)
            None (the default) is treated as an empty dictionary
        edit_distance: Maximum edit distance between a word and its correction.
            Default is DEFAULT_EDIT_DISTANCE (documented as 2, which is SymSpell
            recommendation for reasonable speed and quality)
        ignore_token: Regular expression for words not to be corrected
            Should be a compiled regex object, use re.compile beforehand
        transfer_casing (bool): If True, transfer input word case to the corrected word
            Default is True, which works well for European languages
        compute_diagnosis (bool): If True, compute spellchecker diagnosis of each word
            Adds ~20% processing time but allows to understand what the spellchecker did
    """
    # Fresh containers per instance: a literal set() / {} default is created
    # once and would be shared by every SpellChecker. Rebinding happens
    # before store_attr() so the normalized values are the ones saved.
    if custom_vocabulary_set is None:
        custom_vocabulary_set = set()
    if custom_corrections is None:
        custom_corrections = {}

    store_attr()
    self._symspell_checker_dict = {}
    self.output_column_descriptions = (
        self.OUTPUT_COLUMN_DESCRIPTIONS.copy()
    )  # may be changed by `_prepare_df_for_spellchecker`
    if self.compute_diagnosis:
        # Diagnosis state only exists when diagnosis is requested.
        self._diagnosis_lock = Lock()
        self._token_dict = {
            k: Counter() for k in SUPPORTED_LANGUAGES_SYMSPELL
        }  # may be changed by check_token
        self._diagnosis_list = []  # may be changed by check_token
def __init__(
    self,
    language: AnyStr,
    tokenizer: MultilingualTokenizer,
    category_column: AnyStr,
    ignore_case: bool,
    lemmatization: bool,
    ignore_diacritics: bool,
    text_column_tokenized: AnyStr,
    _use_nfc: bool,
    tag_columns: List[AnyStr],
    _keyword_to_tag: dict = None,
    _matcher_dict: dict = None,
):
    """Store the tagging settings and prepare the empty output containers.

    Every argument is saved as a same-named attribute. An empty output
    dataframe and an empty column-description dictionary are then created,
    and tqdm's pandas integration is registered.
    """
    store_attr()
    # pandas.DataFrame with new columns concerning the found tags
    self.output_df = pd.DataFrame()
    tqdm.pandas(miniters=1, mininterval=5.0)
    # Dictionary of new columns to add in the dataframe (key) and their descriptions (value).
    # Kept as a comment: the original trailing string literal was never closed.
    self.column_descriptions = {}
def __init__(
    self,
    model_name: str,
    batch_size: int = 8,
    attention_probs_dropout_prob: float = 0.4,
    learning_rate: float = 5e-7,
    adam_epsilon: float = 1e-8,
    hidden_dropout_prob: float = 0.3,
    epochs: int = 3,
    lm_model_dir: str = None,
    wname=None,
    drivepath="../drive/My\\ Drive/HinglishNLP/repro",
):
    """Store the training hyper-parameters and start a Weights & Biases run.

    Args:
        model_name (str): model identifier; "bert", "distilbert" and "roberta"
            have a built-in default language-model directory
        batch_size (int): logged to the run config
        attention_probs_dropout_prob (float): logged to the run config
        learning_rate (float): logged to the run config
        adam_epsilon (float): logged to the run config
        hidden_dropout_prob (float): logged to the run config
        epochs (int): logged to the run config
        lm_model_dir (str, optional): language-model directory; when empty, a
            default is chosen from `model_name` if one is known
        wname (str, optional): run name prefix; defaults to `model_name`
        drivepath (str): saved as `self.drivepath`. The backslash is written
            as an explicit escape so the value stays byte-identical while
            avoiding the invalid escape sequence warning
    """
    store_attr()
    # Day.month.year stamp appended to the run name.
    self.timestamp = datetime.now().strftime("%d.%m.%y")
    if not self.wname:
        self.wname = self.model_name

    wandb.init(
        project="hinglish",
        config={
            "model_name": self.model_name,
            "batch_size": self.batch_size,
            "attention_probs_dropout_prob": self.attention_probs_dropout_prob,
            "learning_rate": self.learning_rate,
            "adam_epsilon": self.adam_epsilon,
            "hidden_dropout_prob": self.hidden_dropout_prob,
            "epochs": self.epochs,
        },
        name=f"{self.wname} {self.timestamp}",
    )
    print({"Model Info": f"Setup self.model training for {model_name}"})
    self.device = check_for_gpu(self.model_name)

    if not lm_model_dir:
        # Known models get a default directory; any other model name keeps
        # whatever store_attr() saved.
        default_lm_dirs = {
            "bert": "model_save",
            "distilbert": "distilBert6",
            "roberta": "roberta3",
        }
        default_dir = default_lm_dirs.get(self.model_name)
        if default_dir is not None:
            self.lm_model_dir = default_dir
def __init__(
    self,
    content_categories: List[vision.Feature.Type],
    minimum_score: float = 0,
    max_results: int = 10,
    **kwargs,
):
    """Store the content-category settings and compute the column descriptions.

    Args:
        content_categories: saved as `self.content_categories`
        minimum_score (float): saved as `self.minimum_score`
        max_results (int): saved as `self.max_results`
        **kwargs: accepted but not referenced in this method
    """
    # Arguments are saved before the helper runs (NOTE(review): the helper's
    # body is not visible here; presumably it reads them).
    store_attr()
    self._compute_column_description()
def __init__(self, data: Dict[str, Any], timeline: Dict[str, Any]):
    """Keep the two input mappings on the instance.

    Args:
        data (Dict[str, Any]): saved as `self.data`
        timeline (Dict[str, Any]): saved as `self.timeline`
    """
    # Saves both arguments as same-named attributes
    # (presumably fastcore's store_attr -- confirm against the file's imports).
    store_attr()
def __init__(self, vocab=None, sort=True, add_na=False):
    """Store the vocabulary options, wrapping a given vocab in a CategoryMap.

    Args:
        vocab: optional vocabulary; when not None it is converted with
            `CategoryMap(vocab, sort=sort, add_na=add_na)`
        sort: forwarded to CategoryMap
        add_na: forwarded to CategoryMap
    """
    if vocab is not None:
        vocab = CategoryMap(vocab, sort=sort, add_na=add_na)
    # Runs after the rebinding above; presumably the converted vocab is the
    # value that ends up stored -- confirm against store_attr's behavior.
    store_attr()