Example #1
def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode("utf-8")
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == "en":
        normalized = english.normalize(text)
    elif lang == "ja" and disambiguation is not None:
        match = re.search(r"\((.*?)\)", disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split("/", 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = "n"
            disambiguation = pos + "/" + re.sub(r"\s*\((.*?)\)\s*", "", rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(" ", "_").lower()
    if disambiguation:
        return "/c/%s/%s/%s" % (lang, normalized.replace(" ", "_"), disambiguation)
    else:
        return "/c/%s/%s" % (lang, normalized.replace(" ", "_"))
Example #2
def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text)
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = english.normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.replace(' ', '_')
    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
Example #3
def _load_new_stream(cls, stream):
    worddict = defaultdict(int)
    for line in stream:
        word, freq = line.split(u',')
        word = preprocess_text(word).lower()
        worddict[word] += float(freq)
    return cls(dict(worddict))
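A self-contained sketch of the same comma-separated parsing, with an in-memory list of lines standing in for the stream and plain lowercasing standing in for preprocess_text; repeated words accumulate their frequencies, just as in the classmethod above:

from collections import defaultdict

def sketch_load_csv_wordlist(lines):
    worddict = defaultdict(float)
    for line in lines:
        word, freq = line.rstrip('\n').split(',')
        worddict[word.lower()] += float(freq)
    return dict(worddict)

# sketch_load_csv_wordlist(['the,100\n', 'The,50\n']) -> {'the': 150.0}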
Example #4
    def analyze(self, text):
        """
        Run text through the external process, and get a list of lists
        ("records") that contain the analysis of each word.
        """
        try:
            text = UNSAFE_RE.sub('', preprocess_text(text)).strip()
            if not text:
                return []
            chunks = text.split('\n')
            results = []
            for chunk_text in chunks:
                if chunk_text.strip():
                    text = chunk_text.encode('utf-8')
                    self.send_input(text + '\n')
                    #self.input_log.write(text+'\n')
                    out_line = ''
                    while True:
                        out_line = self.receive_output_line()
                        #self.output_log.write(out_line)
                        out_line = out_line.decode('utf-8')

                        if out_line == u'\n':
                            break

                        record = out_line.strip(u'\n').split(u' ')
                        results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)
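The inner loop reads one analysis record per line and stops at a blank line, which marks the end of the output for the current chunk. A standalone sketch of that framing, with a canned list of lines in place of the external process (the exact record fields depend on the analyzer and are made up here):

def sketch_read_records(output_lines):
    # Collect space-separated records until a blank line ends the chunk.
    results = []
    for out_line in output_lines:
        if out_line == '\n':
            break
        results.append(out_line.strip('\n').split(' '))
    return results

# sketch_read_records(['dog NN dog\n', 'runs VBZ run\n', '\n'])
# -> [['dog', 'NN', 'dog'], ['runs', 'VBZ', 'run']]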
Example #5
def get_frequency(word, lang, default_freq=0, scale=1e9):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.

    >>> int(get_frequency('the', 'en', scale=42))
    42
    >>> int(get_frequency('normalization', 'en'))
    19566
    >>> int(get_frequency('Normalization', 'en'))
    19566
    >>> get_frequency('weirdification', 'en', 100.0)
    100.0
    """
    try:
        freqs = get_wordlist(lang)
    except ZeroDivisionError:
        return default_freq
    factor = scale / freqs.max_freq()

    if " " in word:
        raise ValueError("get_frequency only can only look up single words, "
                         "but %r contains a space" % word)

    lookup = preprocess_text(word).lower()
    return factor * freqs[lookup] + default_freq
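The return value is a linear rescaling: the largest frequency in the list maps to roughly scale, and default_freq is added on top. The arithmetic in isolation:

def sketch_scaled_frequency(raw_freq, max_freq, default_freq=0, scale=1e9):
    # The most frequent word in the list maps to `scale` (plus default_freq).
    factor = scale / max_freq
    return factor * raw_freq + default_freq

# sketch_scaled_frequency(100.0, 100.0, scale=42) -> 42.0
# (cf. the scale=42 doctest above, assuming 'the' is the most frequent word)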
Example #6
def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed
    to uppercase) and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)
    
    Apostrophes are assumed to be at the edge of the word,
    in which case they'll be stripped like they were in the Google data, or
    in the special token "n't" which is treated as "not". This matches the
    output of the tokenize() function.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    freqs = Wordlist.load('google-unigrams.txt')
    if " " in word:
        raise ValueError("word_frequency only can only look up single words, but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).upper()
    if word == "N'T":
        word = 'NOT'
    return freqs.get(word, default_freq)
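Before the lookup, the word is turned into the key convention of the Google list: edge apostrophes are stripped, the key is upper-cased, and the tokenizer's special "n't" token maps to NOT. That key normalization on its own (preprocess_text is assumed to be applied as well in the real code):

def sketch_google_key(word):
    word = word.strip("'").upper()
    if word == "N'T":
        word = 'NOT'
    return word

# sketch_google_key("'tis") -> 'TIS'
# sketch_google_key("n't") -> 'NOT'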
Example #7
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of lists ("records") that contain the MeCab analysis of each
        word.
        """
        text = preprocess_text(text).lower()
        n_chunks = (len(text)+1024)//1024
        results = []
        for chunk in xrange(n_chunks):
            chunk_text = text[chunk*1024:(chunk+1)*1024].encode(self.mecab_encoding)
            self.mecab.stdin.write(chunk_text+'\n')
            #self.input_log.write(text+'\n')
            out_line = ''
            while True:
                out_line = self.mecab.stdout.readline()
                #self.output_log.write(out_line)
                out_line = out_line.decode(self.mecab_encoding)

                if out_line == u'EOS\n':
                    break

                word, info = out_line.strip(u'\n').split(u'\t')
                record = [word] + info.split(u',')
                
                # special case for detecting nai -> n
                if record[0] == u'ん' and record[5] == u'不変化型':
                    record[7] = record[1] = u'ない'

                results.append(record)
        return results
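Each MeCab output line has the shape surface<TAB>feature,feature,..., and a line reading EOS closes the sentence. A standalone sketch of that parsing, using canned lines instead of a live MeCab process:

def sketch_parse_mecab_lines(lines):
    results = []
    for out_line in lines:
        if out_line == u'EOS\n':
            break
        word, info = out_line.rstrip(u'\n').split(u'\t')
        results.append([word] + info.split(u','))
    return results

# Hypothetical input: [u'犬\t名詞,一般,*,*,*,*,犬,イヌ,イヌ\n', u'EOS\n']
# -> [[u'犬', u'名詞', u'一般', u'*', u'*', u'*', u'*', u'犬', u'イヌ', u'イヌ']]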
Example #8
def get_frequency(word, lang, default_freq=0, scale=1e9):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.

    >>> int(get_frequency('the', 'en', scale=42))
    42
    >>> int(get_frequency('normalization', 'en'))
    19566
    >>> int(get_frequency('Normalization', 'en'))
    19566
    >>> get_frequency('weirdification', 'en', 100.0)
    100.0
    """
    try:
        freqs = get_wordlist(lang)
    except ZeroDivisionError:
        return default_freq
    factor = scale / freqs.max_freq

    if " " in word:
        raise ValueError("get_frequency only can only look up single words, "
                         "but %r contains a space" % word)

    lookup = preprocess_text(word).lower()
    return factor * freqs[lookup] + default_freq
Example #9
def _load_new_stream(cls, stream):
    worddict = defaultdict(int)
    mode = None
    # We need to distinguish between two modes, to handle old and new
    # files:
    # 1. comma-separated linear frequency values
    # 2. tab-separated logarithmic values in dB
    for line in stream:
        if mode is None:
            if '\t' in line:
                mode = 2
            elif ',' in line:
                mode = 1
            else:
                raise ValueError(
                    "I don't recognize the format of this wordlist file.")
        if mode == 1:
            word, freq = line.rstrip().split(',')
            freq = float(freq)
        elif mode == 2:
            word, freq = line.rstrip().split('\t')
            freq = 10**(float(freq) / 10)
        word = preprocess_text(word).lower()
        worddict[word] += freq
    return cls(dict(worddict))
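The two on-disk formats differ in units: mode 1 stores linear frequencies directly, while mode 2 stores decibels, which the code converts back with 10**(dB / 10). A quick check of that conversion:

def sketch_db_to_linear(db_value):
    # 0 dB -> 1.0, 10 dB -> 10.0, 20 dB -> 100.0
    return 10 ** (db_value / 10.0)

assert sketch_db_to_linear(20.0) == 100.0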
Example #10
def _load_new_stream(cls, stream):
    worddict = defaultdict(int)
    for line in stream:
        word, freq = line.split(u',')
        word = preprocess_text(word).lower()
        worddict[word] += float(freq)
    return cls(dict(worddict))
Example #11
def _load_stream(cls, stream):
    worddict = {}
    mode = None
    # We need to distinguish between two modes, to handle old and new
    # files:
    # 1. comma-separated linear frequency values
    # 2. tab-separated logarithmic values in dB
    for line in stream:
        if mode is None:
            if '\t' in line:
                mode = 2
            elif ',' in line:
                mode = 1
            else:
                raise ValueError(
                    "I don't recognize the format of this wordlist file."
                )
        if mode == 1:
            word, freq = line.rstrip().split(',')
            freq = float(freq)
        elif mode == 2:
            word, freq = line.rstrip().split('\t')
            freq = 10**(float(freq)/10)
        word = preprocess_text(word).lower()
        worddict[word] = freq
    return cls(worddict)
Example #12
    def word_frequency(self, word, default_freq=0):
        """
        Looks up the word's frequency in the Leeds Internet corpus for the
        appropriate language.

        FIXME: this returns 0 for words that stem differently in FreeLing when
        we use FreeLing frequencies, and that's most of the words
        """
        freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
        word = self.snowball_stem(word)
        if " " in word:
            raise ValueError("word_frequency only can only look up single words, but %r contains a space" % word)
        word = preprocess_text(word.strip("'")).lower()
        return freqs.get(word, default_freq)
Example #13
    def word_frequency(self, word, default_freq=0):
        """
        Looks up the word's frequency in the Leeds Internet corpus for the
        appropriate language.

        FIXME: this returns 0 for words that stem differently in FreeLing when
        we use FreeLing frequencies, and that's most of the words
        """
        freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
        word = self.snowball_stem(word)
        if " " in word:
            raise ValueError(
                "word_frequency only can only look up single words, but %r contains a space"
                % word)
        word = preprocess_text(word.strip("'")).lower()
        return freqs.get(word, default_freq)
Example #14
def tag_and_stem(text):
    """
    Returns a list of (stem, tag, token) triples:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tokens = tokenize_list(preprocess_text(text))
    tagged = nltk.pos_tag(tokens)
    out = []
    for token, tag in tagged:
        if token in BRACKET_DIC:
            out.append((token, BRACKET_DIC[token], token))
        else:
            stem = morphy_stem(token, tag)
            out.append((stem, tag, token))
    return out
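A minimal sketch of the (stem, tag, token) output shape; hard-coded lookup tables stand in here for nltk.pos_tag and morphy_stem, which are assumed to come from the surrounding project, and the bracket handling is omitted:

def sketch_tag_and_stem(tokens, tag_table, stem_table):
    # tag_table and stem_table are toy stand-ins for the real tagger and stemmer.
    out = []
    for token in tokens:
        tag = tag_table.get(token, 'NN')
        stem = stem_table.get(token, token)
        out.append((stem, tag, token))
    return out

# sketch_tag_and_stem(['big', 'dogs'], {'dogs': 'NNS'}, {'dogs': 'dog'})
# -> [('big', 'NN', 'big'), ('dog', 'NNS', 'dogs')]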
Example #15
def tag_and_stem(text):
    """
    Returns a list of (stem, tag, token) triples:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tokens = tokenize_list(preprocess_text(text))
    tagged = nltk.pos_tag(tokens)
    out = []
    for token, tag in tagged:
        if token in BRACKET_DIC:
            out.append((token, BRACKET_DIC[token], token))
        else:
            stem = morphy_stem(token, tag)
            out.append((stem, tag, token))
    return out
Example #16
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of lists ("records") that contain the MeCab analysis of each
        word.
        """
        try:
            self.process  # make sure things are loaded
            text = preprocess_text(text).replace('\n', ' ').lower()
            n_chunks = (len(text) + 1024) // 1024
            results = []
            for chunk in xrange(n_chunks):
                chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
                    self.mecab_encoding)
                self.send_input(chunk_text + '\n')
                #self.input_log.write(text+'\n')
                out_line = ''
                while True:
                    out_line = self.receive_output_line()
                    #self.output_log.write(out_line)
                    out_line = out_line.decode(self.mecab_encoding)

                    if out_line == u'EOS\n':
                        break

                    word, info = out_line.strip(u'\n').split(u'\t')
                    record_parts = [word] + info.split(u',')

                    # Pad the record out to have 10 parts if it doesn't
                    record_parts += [None] * (10 - len(record_parts))
                    record = MeCabRecord(*record_parts)

                    # special case for detecting nai -> n
                    if record.surface == u'ん' and record.conjugation == u'不変化型':
                        # rebuild the record so that record.root is 'nai'
                        record_parts[MeCabRecord._fields.index('root')] = u'ない'
                        record = MeCabRecord(*record_parts)

                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)
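Here the record is a namedtuple padded out to a fixed number of fields, so short MeCab lines still expose every attribute by name. A self-contained sketch of that padding, with a made-up three-field record in place of the real MeCabRecord:

from collections import namedtuple

# The field names below are an assumption for illustration only.
SketchRecord = namedtuple('SketchRecord', ['surface', 'pos', 'root'])

def sketch_padded_record(parts):
    parts = list(parts) + [None] * (len(SketchRecord._fields) - len(parts))
    return SketchRecord(*parts)

# sketch_padded_record([u'ん', u'助動詞'])
# -> SketchRecord(surface=u'ん', pos=u'助動詞', root=None)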
Example #17
def get_frequency(word, lang, default_freq=0):
    """
    Looks up a word's frequency in our preferred frequency list for the given
    language.
    """
    word = preprocess_text(word)
    if lang == 'en':
        filename = 'google-unigrams.txt'
        word = word.upper()
    else:
        filename = 'leeds-internet-%s.txt' % lang
        word = word.lower()
    freqs = Wordlist.load(filename)

    if " " in word:
        raise ValueError("word_frequency only can only look up single words, but %r contains a space" % word)
    # roman characters are in lowercase
    
    return freqs.get(word, default_freq)
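The branch above only chooses a wordlist file and a case convention per language: the English Google unigrams are stored upper-case, the Leeds lists lower-case. That selection in isolation (file names as in the code above):

def sketch_wordlist_choice(word, lang):
    if lang == 'en':
        return 'google-unigrams.txt', word.upper()
    return 'leeds-internet-%s.txt' % lang, word.lower()

# sketch_wordlist_choice('Chien', 'fr') -> ('leeds-internet-fr.txt', 'chien')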
Example #18
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces
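A sketch of the fallback behaviour: stopword-like pieces and a leading 'to' are dropped, but if nothing survives, the original text comes back as a one-item list, which is why normalize_list('the') returns [u'the']. The stopword set and the token list here are assumptions; the real code derives them from tokenize_list, morphy_stem, and good_lemma:

def sketch_normalize_list(words, stopwords):
    pieces = [w for w in words if w not in stopwords]
    if not pieces:
        # Fall back to the whole original text as a single item.
        return [' '.join(words)]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces

# sketch_normalize_list(['the', 'dog'], {'the'}) -> ['dog']
# sketch_normalize_list(['the'], {'the'}) -> ['the']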
Example #19
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces
Example #20
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of lists ("records") that contain the MeCab analysis of each
        word.
        """
        try:
            self.process  # make sure things are loaded
            text = preprocess_text(text).replace('\n', '').lower()
            n_chunks = (len(text) + 1024) // 1024
            results = []
            for chunk in xrange(n_chunks):
                chunk_text = text[chunk * 1024:(chunk + 1) * 1024].encode(
                    self.mecab_encoding)
                self.send_input(chunk_text + '\n')
                #self.input_log.write(text+'\n')
                out_line = ''
                while True:
                    out_line = self.receive_output_line()
                    #self.output_log.write(out_line)
                    out_line = out_line.decode(self.mecab_encoding)

                    if out_line == u'EOS\n':
                        break

                    word, info = out_line.strip(u'\n').split(u'\t')
                    record = [word] + info.split(u',')

                    # special case for detecting nai -> n
                    if record[0] == u'ん' and record[5] == u'不変化型':
                        record[7] = u'ない'

                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)
Example #21
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of lists ("records") that contain the MeCab analysis of each
        word.
        """
        try:
            self.process  # make sure things are loaded
            text = preprocess_text(text).lower()
            n_chunks = (len(text) + 1024) // 1024
            results = []
            for chunk in xrange(n_chunks):
                chunk_text = text[chunk * 1024 : (chunk + 1) * 1024].encode(self.mecab_encoding)
                self.send_input(chunk_text + "\n")
                # self.input_log.write(text+'\n')
                out_line = ""
                while True:
                    out_line = self.receive_output_line()
                    # self.output_log.write(out_line)
                    out_line = out_line.decode(self.mecab_encoding)

                    if out_line == u"EOS\n":
                        break

                    word, info = out_line.strip(u"\n").split(u"\t")
                    record = [word] + info.split(u",")

                    # special case for detecting nai -> n
                    if record[0] == u"ん" and record[5] == u"不変化型":
                        record[7] = record[1] = u"ない"

                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)
Example #22
def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed
    to uppercase) and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word,
    in which case they'll be stripped like they were in the Google data, or
    in the special token "n't" which is treated as "not". This matches the
    output of the tokenize() function.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    freqs = Wordlist.load('google-unigrams.txt')
    if " " in word:
        raise ValueError("word_frequency only can only look up single words, "
                         "but %r contains a space" % word)
    word = preprocess_text(word.strip("'")).lower()
    if word == "n't":
        word = 'not'
    return freqs.get(word, default_freq)