Exemplo n.º 1
0
    def _parse_diacritics(self, ch: str) -> str:
        """
        Encode a character as its lower-cased base followed by three "/"
        boundaries that delimit its length mark and its diaeresis.

        EG: input with base a -> a/LENGTH/DIAERESIS/

        A slot is left empty when the corresponding lookup returns a falsy
        value, so a character without either mark yields "a///".

        :param ch: character
        :return: a string with separated and organized diacritics for easier
            access later.
        """
        stem = chars.base(ch).lower()
        length_mark = chars.length(ch)
        diaeresis_mark = chars.diaeresis(ch)

        # Slot order is fixed: base, length, diaeresis.
        segments = [
            stem,
            length_mark if length_mark else "",
            diaeresis_mark if diaeresis_mark else "",
        ]

        # Every segment, including the last one, is closed by a boundary.
        return "/".join(segments) + "/"
Exemplo n.º 2
0
 def remove_accent(cls, txt: str) -> str:
     """Remove accents from the characters of ``txt``.

     Every character is passed through ``base`` and the results are
     concatenated in their original order.
     """
     return "".join(base(symbol) for symbol in txt)
Exemplo n.º 3
0
    def remove_accent(self, txt: str) -> str:
        # The docstring is a raw string so that the Doxygen commands
        # (\brief, \code, \endcode, ...) are kept literally instead of being
        # interpreted as string escapes (\b would become a backspace).
        r"""!
        \brief remove accents from ancient greek characters

        Every character of txt is passed through base() and the results are
        concatenated in their original order.

        \param txt text whose characters are to be reduced
        \return the concatenation of base(t) for every character t of txt

        \code

        remove_accent("ἄρχω")  # gives "αρχω"

        \endcode
        """
        return "".join(base(t) for t in txt)
Exemplo n.º 4
0
    def train(self):
        """
        Extract (custom) statistical features from the training dataset.

        Reads 'training dataset.csv' (';'-separated) and, for every row,
        cleans the text in the 'Tweet' column, derives three features from it
        and appends them to ``self.train_attributes['features']``; the row's
        'Target' value is appended to ``self.train_attributes['target']``.

        Features per tweet:
          * feature_a: number of whitespace-separated terms left after
            cleaning
          * feature_b: 1-based position of the first term found in
            ``search_queries``, or 0 when there is none
          * feature_c: 1-based position of the first term that is a key of
            ``places_dict``, or 0 when there is none
        """
        train_set = pd.read_csv('training dataset.csv', sep=';')

        # Loop-invariant table mapping every punctuation character to a space.
        punctuation_to_space = str.maketrans(
            string.punctuation, ' ' * len(string.punctuation))

        for _, row in train_set.iterrows():
            # firstly, we preprocess every training tweet
            # make tweet's text lowercase
            processed_tweet = row['Tweet'].lower()
            # remove urls and @usernames via RegularExpression
            # (a hashtag's '#' is only dropped by the punctuation step below;
            # the hashtag word itself is kept)
            processed_tweet = re.sub(r"(http\S+)|(@\S+)", "", processed_tweet)
            # replace punctuation with spaces
            processed_tweet = processed_tweet.translate(punctuation_to_space)
            # pass every character through base() to remove its accent
            processed_tweet = ''.join(base(char) for char in processed_tweet)
            # drop every term listed in `stop`
            # (standard abbreviations like 'rt', 'via', 'amp')
            processed_tweet = ' '.join(
                term for term in processed_tweet.split() if term not in stop
            )
            processed_tweet = emoji_pattern.sub(u'', processed_tweet)

            # Split once, after emoji removal, and reuse for all features.
            terms = processed_tweet.split()

            # now, we extract the features from training tweets
            # 'b' is the 1-based position of the FIRST search query (keyword)
            # within the tweet, 0 when no term matches
            b = next((position
                      for position, term in enumerate(terms, start=1)
                      if term in search_queries), 0)
            # 'c' is the same for the first term that is a known place
            c = next((position
                      for position, term in enumerate(terms, start=1)
                      if term in places_dict.keys()), 0)
            extracted_features = {
                'feature_a': len(terms),
                'feature_b': b,
                'feature_c': c
            }

            # we store our tweets' extracted features and their corresponding
            # target values into lists
            self.train_attributes['features'].append(extracted_features)
            self.train_attributes['target'].append(row['Target'])
Exemplo n.º 5
0
    def _parse_diacritics(self, ch):
        """
        Return a string with separated and organized diacritics
        for easier access later.

        EG: input with base α -> α/ACCENT/ETC/
        (where ETC includes diaeresis, iota subscripts, and macrons)

        When a rough breathing is found and the base is not "ρ", the result
        is prefixed with "h///".

        :param ch: character
        :return: the lower-cased base followed by the accent slot and the
            ETC slot, each closed by a "/" boundary.
        """
        # Additions to greek_accentuation.characters for use here:
        marked_breathing = chars.extract_diacritic(chars.ROUGH)
        # (Don't need SMOOTH for these purposes)
        marked_accents = chars.extract_diacritic(chars.ACUTE, chars.CIRCUMFLEX)
        # (Don't need GRAVE for these purposes)
        marked_length = chars.extract_diacritic(chars.LONG)
        # (Don't need SHORT for these purposes)

        h = marked_breathing(ch)
        acc = marked_accents(ch)
        etc = [
            chars.diaeresis(ch),
            chars.iota_subscript(ch),
            marked_length(ch),
        ]

        out = chars.base(ch).lower()  # Initialize out as base of character.

        if h is not None and out != "ρ":  # If any rough breathing, and not rho
            out = "h///" + out  # insert an h/// before the base.
            # ('aspirated' rhos can be ignored,
            # and dealt with separately.)

        out += "/"  # Create 1st boundary

        if acc is not None:  # If any accent, place between 1st and 2nd boundary
            out += acc

        out += "/"  # Create 2nd boundary

        # If any other diacritics, place between second and final boundary
        out += "".join(mark for mark in etc if mark is not None)

        out += "/"  # Create final boundary

        return out
Exemplo n.º 6
0
    def _parse_diacritics(self, ch):
        """
        Return a string with separated and organized diacritics
        for easier access later.

        EG: input with base α -> α/ACCENT/ETC/
        (where ETC includes diaeresis, iota subscripts, and macrons)

        A rough breathing on any base other than "ρ" is rendered as an
        "h///" prefix in front of the base.

        :param ch: character
        :return: lower-cased base, then the accent (if any), then the other
            diacritics (if any), each section terminated by "/".
        """
        # Additions to greek_accentuation.characters for use here:
        # (SMOOTH, GRAVE and SHORT are not needed for these purposes)
        marked_breathing = chars.extract_diacritic(chars.ROUGH)
        marked_accents = chars.extract_diacritic(
            chars.ACUTE, chars.CIRCUMFLEX
        )
        marked_length = chars.extract_diacritic(chars.LONG)

        h = marked_breathing(ch)
        acc = marked_accents(ch)
        etc = [
            chars.diaeresis(ch), chars.iota_subscript(ch), marked_length(ch)
        ]

        out = chars.base(ch).lower()  # Initialize out as base of character.

        # If any rough breathing, and not rho, insert an h/// before the base.
        # ('aspirated' rhos can be ignored, and dealt with separately.)
        if h is not None and out != "ρ":
            out = "h///" + out

        out += "/"  # Create 1st boundary

        # If any accent, place between 1st and 2nd boundary
        if acc is not None:
            out += acc

        out += "/"  # Create 2nd boundary

        # If any other diacritics, place between second and final boundary
        for c in etc:
            if c is not None:
                out += c

        out += "/"  # Create final boundary

        return out
Exemplo n.º 7
0
    def _parse_diacritics(self, ch):
        """
        Return a string with separated and organized diacritics
        for easier access later.

        EG: input with base a -> a/LENGTH/DIAERESIS/

        :param ch: character
        :return: the lower-cased base, the length mark (if any) and the
            diaeresis (if any), each followed by a "/" boundary.
        """
        out = chars.base(ch).lower()  # Initialize out as base of character.

        length = chars.length(ch)
        dia = chars.diaeresis(ch)

        out += "/"  # Create 1st boundary

        # If any length, place between 1st and 2nd boundary
        if length is not None:
            out += length

        out += "/"  # Create 2nd boundary

        # If any diaeresis, place between second and final boundary
        if dia is not None:
            out += dia

        out += "/"  # Create final boundary

        return out
Exemplo n.º 8
0
    def save(self, *args, **kwargs):
        """
        Normalise ``base_expansion`` and delegate to the parent ``save``.

        Every character of ``self.base_expansion`` is replaced by
        ``base(character)`` before saving. All positional and keyword
        arguments are forwarded to the parent implementation so that options
        supplied by the caller are honoured.
        """
        self.base_expansion = ''.join(base(i) for i in self.base_expansion)
        if not self.image:
            # NOTE(review): without an image the parent save() is never
            # called, so nothing is persisted -- confirm this is intended.
            return

        super(Symbol, self).save(*args, **kwargs)
Exemplo n.º 9
0
def strip_diacritics(s):
    """Return ``s`` with every character replaced by its ``base`` form.

    The characters are processed in order and joined back into one string.
    """
    stripped = [base(character) for character in s]
    return ''.join(stripped)