def test_smart_lower():
    # test that the PoS of an n-gram entry is not lowercased
    assert DocumentFeature.smart_lower('Cat/N') == 'cat/N'
    assert DocumentFeature.smart_lower('Cat/n') == 'cat/n'
    assert DocumentFeature.smart_lower('Red/J_CaT/N') == 'red/J_cat/N'
    assert DocumentFeature.smart_lower('Red/J CaT/N', separator=' ') == 'red/J cat/N'

    # test that features are not touched
    assert DocumentFeature.smart_lower('amod-DEP:former', lowercasing=False) == 'amod-DEP:former'
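
# A minimal sketch of the behaviour the assertions above pin down, inferred only
# from the tests (the real DocumentFeature.smart_lower lives elsewhere in the repo
# and may differ): lowercase the word part of each word/PoS unigram, leave the PoS
# tag untouched, and pass the string through unchanged when lowercasing=False.
def smart_lower_sketch(words_with_pos, separator='_', lowercasing=True):
    if not lowercasing:
        return words_with_pos
    unigrams = []
    for unigram in words_with_pos.split(separator):
        if '/' in unigram:
            word, pos = unigram.rsplit('/', 1)  # split on the last slash only
            unigrams.append('%s/%s' % (word.lower(), pos))
        else:
            unigrams.append(unigram.lower())
    return separator.join(unigrams)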
def _read_vector(vector_file):
    bn = os.path.basename(vector_file)
    sent_file = os.path.join(os.path.dirname(vector_file), "%s.sent" % bn.split(".")[0])
    if not os.path.exists(sent_file):
        return "__MISSING__", {}

    with open(sent_file) as infile:
        phrase = " ".join(line.strip().split("\t")[1] for line in infile if line.strip())

    with gzip.open(vector_file) as infile:
        file_content = infile.readline().decode("utf8").strip().split("\t")
    features = [(DocumentFeature.smart_lower(word, lowercasing=True), float(count))
                for (word, count) in walk_nonoverlapping_pairs(file_content, beg=0)]
    return phrase, features
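
# A hedged usage sketch for _read_vector. It assumes the on-disk layout implied
# above: next to foo.vectors.gz sits a foo.sent file whose second tab-separated
# column holds the phrase's tokens, and the gzipped file's first line holds
# tab-separated (feature, count) pairs. All paths and contents here are made up
# for illustration.
def _example_read_vector():
    import gzip
    import os
    import tempfile

    tmp = tempfile.mkdtemp()
    with open(os.path.join(tmp, 'foo.sent'), 'w') as f:
        f.write('1\tred\n2\tcat\n')  # columns: token index, token
    with gzip.open(os.path.join(tmp, 'foo.vectors.gz'), 'wb') as f:
        f.write(b'red/J\t1.0\tcat/N\t2.0\n')  # feature, count, feature, count, ...

    phrase, features = _read_vector(os.path.join(tmp, 'foo.vectors.gz'))
    # phrase   == 'red cat'
    # features == [('red/J', 1.0), ('cat/N', 2.0)]
    return phrase, features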
def __iter__(self):
    for fname in self.files:
        filename = join(self.dirname, fname)
        infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
        with contextlib.closing(infile):
            for line in infile:
                # yield gensim.utils.tokenize(line, lower=True)
                if isinstance(line, bytes):
                    line = line.decode()
                res = [DocumentFeature.smart_lower(w) for w in line.split()
                       if DocumentFeature.from_string(w).type != 'EMPTY']
                if len(res) > 8:  # ignore short sentences, they are probably noise
                    if self.remove_pos:
                        yield [x.split('/')[0] for x in res]
                    else:
                        yield res
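
# Hedged usage sketch: the class that owns __iter__ above acts as a restartable
# corpus of token lists, which is the "sentences" interface gensim's Word2Vec
# expects. The class name (PoSCorpusReader) and its constructor arguments are
# assumptions made up for illustration; only __iter__'s contract comes from the
# code above.
def _example_train_word2vec():
    from gensim.models import Word2Vec

    corpus = PoSCorpusReader('/data/corpora/wiki-pos', remove_pos=False)  # hypothetical
    # each yielded sentence is a list like ['red/J', 'cat/N', ...], already
    # smart-lowered, with EMPTY tokens and short sentences dropped
    return Word2Vec(corpus, min_count=5)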
def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
             lowercasing=False, ngram_separator='_',
             allow_lexical_overlap=True,
             row_filter=lambda x, y: True,
             column_filter=lambda x: True,
             max_len=50, max_neighbours=1e8,
             merge_duplicates=False,
             immutable=True,
             enforce_word_entry_pos_format=True,
             tar=False,
             **kwargs):
    """
    Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
    If duplicate values are encountered during parsing, only the latest will be kept.

    :param tsv_file: path to input TSV file
    :type tsv_file: str
    :param sim_threshold: min similarity between an entry and its neighbour for the neighbour
        to be included
    :type sim_threshold: float
    :param include_self: whether to include self as nearest neighbour.
    :type include_self: bool
    :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
        Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no
        lowercasing will take place. This might be desirable when reading feature lists or
        already-lowercased neighbour lists. FET + Byblo thesauri are already lowercased.
    :type lowercasing: bool
    :param ngram_separator: when n-gram entries are read in, what the individual tokens are
        separated by
    :param column_filter: a function that takes a string (column in the file) and returns
        whether or not the string should be kept
    :param row_filter: takes a string and its corresponding DocumentFeature and determines if
        it should be loaded. If `enforce_word_entry_pos_format` is `False`, the second parameter
        to this function will be `None`
    :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically
        with the entry they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS
        SLIGHTLY DIFFERENT FROM THE EQUIVALENT IN VECTORS. SEE COMMENT THERE.
    :param max_len: maximum length (in characters) of permissible **entries**. Longer entries
        are ignored.
    :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering
        defined by column_filter and allow_lexical_overlap is finished.
    :param merge_duplicates: whether to raise an error if multiple entries exist, or
        concatenate/add them together. The former is appropriate for `Thesaurus`, and the
        latter for `Vectors`.
    :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format
        are skipped. This must be true for `allow_lexical_overlap` to work.
    :param tar: whether the file is compressed by running `tar -zcvf file.gz file.txt`.
        Assuming the tar contains a single file.
    """
    if not tsv_file:
        raise ValueError("No thesaurus specified")

    to_return = dict()
    logging.info('Loading thesaurus %s from disk', tsv_file)

    gz_file = tsv_file + '.gz'
    if os.path.exists(gz_file) and tar:
        logging.warning('Using .gz version of thesaurus')
        tsv_file = gz_file

    if not allow_lexical_overlap:
        logging.warning('DISALLOWING LEXICAL OVERLAP')

    if not allow_lexical_overlap and not enforce_word_entry_pos_format:
        raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                         'Please enable enforce_word_entry_pos_format')
    FILTERED = '___FILTERED___'.lower()

    if tar:
        tarf = tarfile.open(tsv_file, 'r')
        members = tarf.getmembers()
        if len(members) != 1:
            # todo this is odd, I don't know why it is happening
            # on some machines tar adds a second hidden file to the archive
            logging.warning('Tar archive contains multiple files: %r' % members)
            logging.warning('Using the last file in the tar')
        fhandle = tarf.extractfile(members[-1])
    else:
        fhandle = open(tsv_file)

    with fhandle as infile:
        for line in infile.readlines():
            if tar:
                # this is a byte stream, needs to be decoded
                tokens = line.decode('UTF8').strip().split('\t')
            else:
                tokens = line.strip().split('\t')

            if len(tokens) % 2 == 0:
                # must have an odd number of things, one for the entry
                # and pairs for (neighbour, similarity)
                logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                continue

            if tokens[0] != FILTERED:
                key = DocumentFeature.smart_lower(tokens[0], ngram_separator, lowercasing)
                dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                    # do not load things in the wrong format, they'll get in the way later
                    logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                    continue

                if (not row_filter(key, dfkey)) or len(key) > max_len:
                    logging.warning('Skipping entry for %s', key)
                    continue

                to_insert = [(DocumentFeature.smart_lower(word, ngram_separator, lowercasing), float(sim))
                             for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                             if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]

                if not allow_lexical_overlap:
                    to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)

                if len(to_insert) > max_neighbours:
                    # slice index must be an int; the default max_neighbours (1e8) is a float
                    to_insert = to_insert[:int(max_neighbours)]

                if include_self:
                    to_insert.insert(0, (key, 1.0))

                # the steps above may filter out all neighbours of an entry. if this happens,
                # do not bother adding it
                if len(to_insert) > 0:
                    if key in to_return:
                        # this is a duplicate entry, merge it or raise an error
                        if merge_duplicates:
                            logging.warning('Multiple entries for "%s" found. Merging.', tokens[0])
                            c = Counter(dict(to_return[key]))
                            c.update(dict(to_insert))
                            to_return[key] = [(k, v) for k, v in c.items()]
                        else:
                            raise ValueError('Multiple entries for "%s" found.' % tokens[0])
                    else:
                        to_return[key] = to_insert
                else:
                    logging.warning('Nothing survived filtering for %r', key)
    return Thesaurus(to_return, immutable=immutable)
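
# A hedged usage sketch for from_tsv, assuming a Byblo-style sims file in which
# each line reads: entry<TAB>neighbour1<TAB>sim1<TAB>neighbour2<TAB>sim2 ...
# The path is made up for illustration, and dict-style lookup on the returned
# Thesaurus is an assumption based on the dict built above.
def _example_load_thesaurus():
    th = Thesaurus.from_tsv('/data/thesauri/wiki.sims.neighbours',  # hypothetical path
                            sim_threshold=0.1,  # drop weak neighbours
                            max_neighbours=10,  # keep at most 10 per entry
                            allow_lexical_overlap=False)
    # neighbours come back as (neighbour, similarity) pairs, e.g.
    # th['cat/N'] -> [('dog/N', 0.8), ('kitten/N', 0.7)]
    return th['cat/N']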