def _parse_diacritics(self, ch: str) -> str:
    """Split a character into its base form and diacritic fields.

    EG: input with base a -> a/LENGTH/DIAERESIS/

    :param ch: character
    :return: a string with separated and organized diacritics
        for easier access later. Empty fields are kept, so the
        '/' boundaries are always present.
    """
    # Assemble the three fields (base, length mark, diaeresis); a missing
    # diacritic contributes an empty field between its boundaries.
    fields = [
        chars.base(ch).lower(),
        chars.length(ch) or "",
        chars.diaeresis(ch) or "",
    ]
    return "/".join(fields) + "/"
def remove_accent(cls, txt: str) -> str:
    """Remove accents from characters.

    Each character of *txt* is reduced to its unaccented base form.

    :param txt: text to strip accents from
    :return: the unaccented text
    """
    # Idiomatic join over a generator replaces the manual append loop.
    return "".join(base(t) for t in txt)
def remove_accent(self, txt: str) -> str:
    r"""! \brief remove accents from ancient greek characters

    \code
    >>> Text.remove_accent("ἄρχω")
    'αρχω'
    \endcode
    """
    # NOTE: the docstring is raw so Doxygen's "\brief" is not mangled
    # into a backspace escape, as it was in the non-raw original.
    # Idiomatic join over a generator replaces the manual append loop.
    return "".join(base(t) for t in txt)
def train(self):
    """Extract statistical features from the training dataset.

    Reads 'training dataset.csv', preprocesses each tweet, computes
    three features per tweet, and appends the features and the target
    label to ``self.train_attributes``.
    """
    train_set = pd.read_csv('training dataset.csv', sep=';')
    for _, row in train_set.iterrows():
        # Preprocess: lowercase the tweet's text.
        processed_tweet = row['Tweet'].lower()
        # Remove URLs and @usernames via regular expression.
        # (The original comment also claimed hashtags are removed; the
        # pattern does not match '#...', so that claim was wrong.)
        processed_tweet = re.sub(r"(http\S+)|(@\S+)", "", processed_tweet)
        # Replace every punctuation character with a space.
        processed_tweet = processed_tweet.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        # Strip accents character by character; join avoids the quadratic
        # string '+' accumulation of the original loop.
        processed_tweet = ''.join(base(char) for char in processed_tweet)
        # Remove standard abbreviations / stopwords like 'rt', 'via', 'amp'.
        processed_tweet = ' '.join(
            term for term in processed_tweet.split() if term not in stop)
        processed_tweet = emoji_pattern.sub(u'', processed_tweet)

        # Tokenize once and reuse for all three features.
        tokens = processed_tweet.split()
        # feature_b: 1-based position of the FIRST search-query keyword
        # in the tweet, or 0 when none occurs.
        b = next((i + 1 for i, t in enumerate(tokens)
                  if t in search_queries), 0)
        # feature_c: 1-based position of the first known place name,
        # or 0 when none occurs.
        c = next((i + 1 for i, t in enumerate(tokens)
                  if t in places_dict), 0)
        extracted_features = {
            'feature_a': len(tokens),  # token count of the cleaned tweet
            'feature_b': b,
            'feature_c': c,
        }
        # Store extracted features and the corresponding target value.
        self.train_attributes['features'].append(extracted_features)
        self.train_attributes['target'].append(row['Target'])
def _parse_diacritics(self, ch):
    """Return *ch* as a string of separated, organized diacritics
    for easier access later.

    EG: input with base α -> α/ACCENT/ETC/
    (where ETC includes diaeresis, iota subscripts, and macrons)
    """
    # Diacritic extractors built on greek_accentuation.characters:
    marked_breathing = chars.extract_diacritic(chars.ROUGH)
    # (SMOOTH is not needed for these purposes.)
    marked_accents = chars.extract_diacritic(chars.ACUTE, chars.CIRCUMFLEX)
    # (GRAVE is not needed for these purposes.)
    marked_length = chars.extract_diacritic(chars.LONG)
    # (SHORT is not needed for these purposes.)

    h = marked_breathing(ch)
    acc = marked_accents(ch)
    etc = [chars.diaeresis(ch), chars.iota_subscript(ch), marked_length(ch)]

    out = chars.base(ch).lower()  # initialize out as base of character
    # Rough breathing (and not rho): insert an h/// before the base.
    # ('aspirated' rhos can be ignored and dealt with separately.)
    if h is not None and out != "ρ":  # 'is not None' replaces '!= None'
        out = "h///" + out
    out += "/"  # create 1st boundary
    if acc is not None:  # any accent goes between 1st and 2nd boundary
        out += acc
    out += "/"  # create 2nd boundary
    for c in etc:  # other diacritics go before the final boundary
        if c is not None:
            out += c
    out += "/"  # create final boundary
    return out
def _parse_diacritics(self, ch):
    """Return a string with separated and organized diacritics
    for easier access later.

    EG: input with base α -> α/ACCENT/ETC/
    (where ETC includes diaeresis, iota subscripts, and macrons)
    """
    # Diacritic extractors built on greek_accentuation.characters:
    marked_breathing = chars.extract_diacritic(chars.ROUGH)
    # (SMOOTH is not needed for these purposes.)
    marked_accents = chars.extract_diacritic(
        chars.ACUTE, chars.CIRCUMFLEX
    )  # (GRAVE is not needed for these purposes.)
    marked_length = chars.extract_diacritic(chars.LONG)
    # (SHORT is not needed for these purposes.)

    h = marked_breathing(ch)
    acc = marked_accents(ch)
    etc = [chars.diaeresis(ch), chars.iota_subscript(ch), marked_length(ch)]

    out = chars.base(ch).lower()  # initialize out as base of character
    # Rough breathing (and not rho): insert an h/// before the base.
    # ('aspirated' rhos can be ignored and dealt with separately.)
    if h is not None and out != "ρ":  # 'is not None' replaces '!= None'
        out = "h///" + out
    out += "/"  # create 1st boundary
    if acc is not None:  # any accent goes between 1st and 2nd boundary
        out += acc
    out += "/"  # create 2nd boundary
    # Remaining diacritics go between the 2nd and final boundary.
    out += "".join(c for c in etc if c is not None)
    out += "/"  # create final boundary
    return out
def _parse_diacritics(self, ch):
    """Return a string with separated and organized diacritics
    for easier access later.

    EG: input with base a -> a/LENGTH/DIAERESIS/
    """
    out = chars.base(ch).lower()  # initialize out as base of character
    length = chars.length(ch)
    dia = chars.diaeresis(ch)
    out += "/"  # create 1st boundary
    # Any length mark goes between the 1st and 2nd boundary.
    if length is not None:  # 'is not None' replaces '!= None'
        out += length
    out += "/"  # create 2nd boundary
    # Any diaeresis goes between the 2nd and final boundary.
    if dia is not None:
        out += dia
    out += "/"  # create final boundary
    return out
def save(self, *args, **kwargs):
    """Normalize ``base_expansion`` to unaccented characters, then save.

    Forwards ``*args``/``**kwargs`` to the parent ``save()`` — the
    original silently dropped them, so options such as ``using=`` or
    ``update_fields=`` were ignored.
    """
    self.base_expansion = ''.join(base(i) for i in self.base_expansion)
    # NOTE(review): this only persists when no image is attached; when an
    # image exists the method returns None without saving. Looks
    # intentional from here, but confirm against callers.
    if not self.image:
        return super(Symbol, self).save(*args, **kwargs)
def strip_diacritics(s):
    """Removes all diacritics from the given string and returns it."""
    # Map every character to its unaccented base form and reassemble.
    return ''.join(map(base, s))