Пример #1
0
def cleanup_title(value):
    """Return ``value`` title-cased, with configured company acronyms restored.

    One leading and one trailing quote character (``"`` or ``'``) are removed,
    every remaining double quote is dropped, HTML entities are unescaped and
    the text is title-cased with ICU's ``en_US`` title break iterator.
    Afterwards each entry of ``settings.COMPANY_ACRONYMS`` is applied:
    ``acronym[1]`` is the spelling looked for in the title-cased text and
    ``acronym[0]`` is the spelling it is replaced with.

    Parameters:
        value: the raw title string.

    Returns:
        The cleaned-up title. If nothing is left once quotes and surrounding
        whitespace are removed, the (empty) string is returned as is.
    """
    # startswith/endswith are safe on empty strings, whereas indexing
    # value[0] / value[-1] raised IndexError for "" or a lone quote.
    quote_chars = ('"', "'")
    if value.startswith(quote_chars):
        value = value[1:]
    if value.endswith(quote_chars):
        value = value[:-1]
    value = value.replace('"', "").strip()
    if not value:
        # Nothing to title-case.
        return value

    # Need to use ICU rather than .title() because .title()
    # does not handle things like "Wouldn't" properly. It
    # converts it to "Wouldn'T" rather than keeping the T
    # lowercase
    value = HTMLParser.HTMLParser().unescape(value.lower())
    en_us_locale = icu.Locale('en_US')
    break_iter = icu.BreakIterator.createTitleInstance(en_us_locale)
    temp_title = icu.UnicodeString(value)
    title = unicode(temp_title.toTitle(break_iter, en_us_locale))

    word_enders = [" ", ",", ".", ";", ":", '"', "'", "-"]
    for acronym in settings.COMPANY_ACRONYMS:
        wanted, found = acronym[0], acronym[1]
        if '.com' in wanted:
            # .com often comes at the end of a title so we don't want to add
            # the trailing space check
            title = title.replace(found, wanted)
            continue

        # Acronym at the very end of the title. endswith() replaces the old
        # rfind() comparison, which also matched when the acronym was absent
        # and the title was exactly one character shorter than the acronym
        # (-1 == -1), clobbering the title's last character.
        if title.endswith(found):
            title = "%s%s" % (title[:len(title) - len(found)], wanted)

        # Acronym followed by a character that ends a word.
        for ender in word_enders:
            title = title.replace("%s%s" % (found, ender),
                                  "%s%s" % (wanted, ender))
    return title
Пример #2
0
def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr:
    """Apply ``normalizer`` to ``bs`` piece by piece and return the built result.

    The modified text of ``bs`` is walked as an ICU ``UnicodeString``. Spans
    that ``spanQuickCheckYes`` accepts are skipped in the builder; every other
    piece is cut at the next position where ``hasBoundaryBefore`` is true,
    normalized on its own, and recorded as a replacement.

    Parameters:
        bs: the string to normalize.
        normalizer: the ICU normalizer to apply.

    Returns:
        The result of ``BistrBuilder.build()`` after all skips/replacements.
    """
    builder = BistrBuilder(bs)
    remaining = icu.UnicodeString(bs.modified)

    while not builder.is_complete:
        # Leading span the normalizer's quick check accepts: keep it as is.
        # The UTF-16 length is converted to a code point count for the builder.
        stable_end = normalizer.spanQuickCheckYes(remaining)
        builder.skip(remaining.countChar32(0, stable_end))
        if builder.is_complete:
            break
        remaining = remaining[stable_end:]

        # Advance one code point at a time (two UTF-16 units when the unit
        # is a lead surrogate) until the normalizer reports a boundary.
        cut = 0
        while cut < len(remaining):
            is_lead_surrogate = (remaining.charAt(cut) & 0xFC00) == 0xD800
            cut += 2 if is_lead_surrogate else 1
            if normalizer.hasBoundaryBefore(chr(remaining.char32At(cut))):
                break

        piece = remaining[:cut]
        replacement = str(normalizer.normalize(piece))
        builder.replace(piece.countChar32(), replacement)
        remaining = remaining[cut:]

    return builder.build()
Пример #3
0
    def __init__(self):
        """Create the component graph and fill it from the IDS data file.

        Every line of ``babelstone.PATH_TO_IDS_TXT`` that is neither a comment
        nor uses an unsupported notation is passed to ``parse``; each non-None
        result is handed to ``self.insert``.
        """
        # Graph where the nodes are unicode characters and the edges are
        # "contains", such that successors(尔) = [...你...] and
        # predecessors(你) = [亻,尔].
        # So, insert with self._graph.add_edge( "亻", "你" )
        #                 self._graph.add_edge( "尔", "你" )
        self._graph = nx.DiGraph()

        with open(babelstone.PATH_TO_IDS_TXT, encoding="UTF-8") as fp:
            for line in fp:
                # Skipped lines:
                #   * comments (leading "#");
                #   * TODO(ambuc): lines using notations ids.txt has for
                #     components we cannot handle yet:
                #       {1}, {2}, etc.  unencoded components.
                #       ↔   mirror operator: a component without a Unicode
                #           encoding whose mirror does have one.
                #       ↷   rotation operator: a component without a Unicode
                #           encoding whose 180deg rotation does have one.
                #       〾   variation indicator. We should try to handle
                #           these.
                #       ?, ?  used by ids.txt for an unencodable component.
                #     We should probably try to handle these edge cases.
                if line.startswith("#") or re.search("[{}↔↷〾??]", line):
                    continue

                maybe_parsed_set = parse(str(icu.UnicodeString(line)))
                if maybe_parsed_set is None:
                    continue
                self.insert(maybe_parsed_set)
Пример #4
0
def get_section_title(ch):
    """Return the section heading for ``ch``.

    ``ch`` is converted to unicode and NFKD-normalized, and only the first
    character of that decomposition is considered.

    Returns:
        ``''`` when that character is not in a letter category; the character
        itself when it is a letter other than lower-case; otherwise its
        upper-case form according to the module-level ``lang_locale``.
    """
    base_char = unicodedata.normalize('NFKD', unicode(ch))[0]
    # General category is always two characters: major class + subclass.
    major, minor = unicodedata.category(base_char)
    if major != 'L':  # Not a letter
        return ''
    if minor == 'l':  # Lower-case letter: upper-case it for the heading
        return unicode(icu.UnicodeString(base_char).toUpper(lang_locale))
    # Upper-case or special letter: use as is
    return base_char
Пример #5
0
def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr:
    """Run an ICU editing operation on ``bs`` and replay its edits in a builder.

    Parameters:
        bs: the string to transform.
        op: callable invoked as ``op(text, edits)``, or as
            ``op(icu.Locale(locale), text, edits)`` when ``locale`` is given;
            it fills the ``icu.Edits`` it receives.
        locale: optional locale identifier for locale-sensitive operations.

    Returns:
        The result of ``BistrBuilder.build()`` after every reported edit has
        been applied as a skip or a replacement.
    """
    builder = BistrBuilder(bs)
    edits = icu.Edits()
    source = icu.UnicodeString(builder.current)

    # Locale-aware operations take the locale as their first argument.
    if locale is None:
        call_args = (source, edits)
    else:
        call_args = (icu.Locale(locale), source, edits)
    result = icu.UnicodeString(op(*call_args))

    for changed, src_len, dst_len, src_start, dst_start, _ in edits.getFineIterator():
        # Convert the source span to a code point count for the builder.
        span = source.countChar32(src_start, src_len)
        if not changed:
            builder.skip(span)
            continue
        builder.replace(span, str(result[dst_start:dst_start + dst_len]))

    return builder.build()
Пример #6
0
    def tokenize(self, text: String) -> Tokenization:
        """Split ``text`` at the boundaries reported by the break iterator.

        Each segment between two consecutive boundaries becomes a token when
        ``self._check_token`` accepts the iterator's rule status for it.

        Returns:
            A ``Tokenization`` of the (bistr-wrapped) text and the kept tokens.
        """
        text = bistr(text)
        tokens = []

        breaker = self._break_iterator()

        utext = icu.UnicodeString(text.modified)
        breaker.setText(utext)

        # u_start/u_end are the iterator's offsets into utext; start/end are
        # the matching offsets counted in code points, used to slice `text`.
        start = 0
        u_start = breaker.first()
        u_end = breaker.nextBoundary()
        while u_end != icu.BreakIterator.DONE:
            end = start + utext.countChar32(u_start, u_end - u_start)
            if self._check_token(breaker.getRuleStatus()):
                tokens.append(Token.slice(text, start, end))
            start, u_start = end, u_end
            u_end = breaker.nextBoundary()

        return Tokenization(text, tokens)
regex_doublequotes = re.compile(r'\"+')

# To detect multiword phrases inside a sentence we use the Gensim library and
# its models.phrases class
# (https://radimrehurek.com/gensim/models/phrases.html). An object of that
# class, fed with all the tokenized reviews, was built and stored beforehand
# (see generate_pickle_for_Gensim.py); here it is loaded back from the pickle
# file.
# NOTE(review): pickle.load can execute arbitrary code; this is only safe as
# long as the file is the one produced locally by that script.
with open("gensim_mulitphrases_" + Business + ".txt", "rb") as fp:
    bigram = pickle.load(fp)

# The character set to be used as the default one when interpreting texts.
CharsSet = "ascii"


# py2casefold is not tested enough, is slow, and could not be installed in
# anaconda; this function is a replacement. It returns a byte string, not
# unicode, because the fast .translate function is used afterwards to remove
# punctuation from strings.
# Source: https://stackoverflow.com/a/32838944/3429115
def casefold(u):
    folded = unicode(icu.UnicodeString(u).foldCase())
    return folded.encode(CharsSet, "replace")

def getWordnetPos(_treebank_tag):
    """
    Map a Treebank part-of-speech tag to its WordNet counterpart.

    > Parameters:
        _treebank_tag : str     | The Treebank tag to translate

    > Returns:
        wordnet.ADJ for tags starting with 'J'; None for any other tag.
    Adapted from https://stackoverflow.com/a/15590384/3429115
    """
    if not _treebank_tag.startswith('J'):
        # No other tag family is mapped here.
        return None
    return wordnet.ADJ
Пример #8
0
        spec = "".join(file(opts.input).readlines())
        brk = icu.RuleBasedBreakIterator(spec)
    else:
        brk = icu.RuleBasedBreakIterator()

    print(brk.getRules())

    if opts.codes:
        text = "".join(chr(int(x, 16)) for x in args)
    elif opts.file:
        text = "".join(file(args[0]).readlines())
    else:
        text = args[0]

    res = []
    brk.setText(icu.UnicodeString(text))
    last = brk.first()
    try:
        while True:
            next = brk.next()
            #            print(next, " ", brk.getRuleStatus())
            res.append(text[last:next])
            last = next
    except:
        res.append(text[last:])

    if opts.hex:
        print(f" {opts.separator} ".join(" ".join(hex(ord(x)) for x in res)
                                         for y in res))
    else:
        print(opts.separator.join(res))
import random

# Main output folder: the text files are written here, and its contents are
# checked to avoid duplicated work.
directory = "./RestaurantsToOthers_Data"

regex_slash_newlines = re.compile(r'[\n\r\\]+')
regex_slash_tabs = re.compile(r'\t+')
regex_doublequotes = re.compile(r'\"+')

# The character set to be used as the default one when interpreting texts.
CharsSet = "ascii"


# py2casefold is not tested enough, is slow, and could not be installed in
# anaconda; this function is a replacement. It returns a byte string, not
# unicode, because the fast .translate function is used afterwards to remove
# punctuation from strings.
# Source: https://stackoverflow.com/a/32838944/3429115
def casefold(u):
    folded = unicode(icu.UnicodeString(u).foldCase())
    return folded.encode(CharsSet, "ignore")


def iter_sample_fast(iterable, samplesize):
    """
    Fast memory-efficient sampling method for pretty large iterables.
    Adopted from: https://stackoverflow.com/a/12583436/3429115
    
    > Parameters:
        * iterable: iterable object | The collection we want to sample from
        
        * samplesize: int           | How many samples to generate
        
    > Returns:
        List of samples, hoewver, since sampling is made without replacement (most probably), they aren't IID; but