Python word_tokenize示例

编程语言: Python

命名空间/包名称: ambiruptor.library.preprocessors.tokenizers

方法/功能: word_tokenize

hotexamples.com的示例: 3

Python word_tokenize - 共找到 3 个示例。以下是从开源项目中提取的、评分最高的 ambiruptor.library.preprocessors.tokenizers.word_tokenize 真实 Python 用法示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： wiki_miners.py 项目： Ambiruptor/Ambiruptor

 def format_corpus(data, senses):
     """Format the corpus in a shape that could be analysed by the
     feature extractor.

     Every document in ``data`` is cut around its wiki links, written
     either as ``[[target]]`` or ``[[target|label]]``.

     * Text between links is passed through ``Wikipedia.clean_wikitext``
       and ``word_tokenize``; the resulting tokens are added one by one.
     * A link contributes its label (the link target is used when no
       explicit label is given). When the target, normalized with
       ``Wikipedia.normalize_title``, is a member of ``senses``, the
       entry is a ``(label, sense)`` tuple; otherwise it is the bare
       label.

     :param data: iterable of wikitext strings, one per document.
     :param senses: container of normalized titles; only membership
         (``in``) is used.
     :return: list with one list of entries per document, in input order.
     """
     # The capturing group makes ``split`` keep the links themselves in
     # its output, so text and links come back interleaved, in order.
     link_splitter = re.compile(r"(\[\[[^\[\]|]*(?:\|[^\[\]|]*)?\]\])")
     link_matcher = re.compile(r"\[\[([^\[\]|]*)(?:\|([^\[\]|]*))?\]\]")
     result = []
     for document in data:
         entries = []
         for chunk in link_splitter.split(document):
             link = link_matcher.match(chunk)
             if link is None:
                 # Plain text between two links.
                 entries.extend(word_tokenize(Wikipedia.clean_wikitext(chunk)))
                 continue
             target, label = link.group(1), link.group(2)
             sense = Wikipedia.normalize_title(target)
             if label is None:
                 # ``[[target]]`` form: the target doubles as the label.
                 label = target
             if sense in senses:
                 entries.append((label, sense))
             else:
                 entries.append(label)
         result.append(entries)
     return result

示例#2

显示文件

文件： utils.py 项目： Ambiruptor/ambiruptor-webapp

def predictor(datadir, text):
    """Predict a sense for each occurrence of a known ambiguous word.

    ``text`` is lower-cased and tokenized with ``word_tokenize``. For every
    entry ``w`` of ``<datadir>/ambiguous_words.txt`` the pickled feature
    extractor ``<datadir>/feature_extractors/<w>.dump`` and the pickled
    model ``<datadir>/models/<w>.dump`` are loaded; entries for which
    either file is missing, or for which no features are extracted, are
    skipped.

    :param datadir: directory holding ``ambiguous_words.txt`` and the
        ``feature_extractors`` / ``models`` sub-directories.
    :param text: the text to disambiguate.
    :return: list of dicts, one per prediction, with the keys ``begin``,
        ``end``, ``all_senses``, ``sense_index`` and ``url``.

    NOTE(review): ``begin``/``end`` are derived from token lengths only, so
    they do not account for whatever separated the tokens in ``text`` --
    confirm this is what the consumer of the result expects.
    """
    # List of ambiguous words (one per line, blank lines ignored)
    filename_ambiguouswords = datadir + "/ambiguous_words.txt"
    with open(filename_ambiguouswords, 'r') as f:
        ambiguous_words = {line.rstrip() for line in f}
    ambiguous_words.discard("")

    # Word tokenize
    results = []
    words = np.array(word_tokenize(text.lower()))

    # Prefix sums of the token lengths, computed once:
    # offsets[i] is the total length of words[:i]. This avoids re-summing
    # the lengths from scratch for every single prediction.
    offsets = [0]
    for token in words:
        offsets.append(offsets[-1] + len(token))

    # Disambiguation
    for w in ambiguous_words:

        # Ambiguous word: the part of the entry before the first underscore
        ambiguous_word = re.match(r"[^_]+", w).group(0).lower()

        # Feature extraction
        filename_features = datadir + "/feature_extractors/" + w + ".dump"
        if not os.path.isfile(filename_features):
            continue
        # SECURITY NOTE: unpickling can execute arbitrary code, so the
        # contents of datadir must come from a trusted source.
        with open(filename_features, "rb") as f:
            ambiguous_extractor = pickle.load(f)

        ambiguous_data = ambiguous_extractor.extract_features(words, ambiguous_word)

        # Nothing was extracted for this word: skip it
        if ambiguous_data.data.shape[0] == 0:
            continue

        # Model prediction
        filename_models = datadir + "/models/" + w + ".dump"
        if not os.path.isfile(filename_models):
            continue

        # Same trust requirement as for the feature extractor above.
        with open(filename_models, "rb") as f:
            model = pickle.load(f)

        # The file is only needed for loading; predict after closing it.
        predictions = model.predict(ambiguous_data)
        for index, meaning in zip(ambiguous_data.targets, predictions):
            begin = offsets[index]
            all_senses = model.model.classes_.tolist()
            results.append({
                "begin": begin,
                "end": begin + len(words[index]),
                "all_senses": all_senses,
                "sense_index": all_senses.index(meaning),
                "url": "https://en.wikipedia.org/wiki/%s" % meaning,
            })

    # Return results
    return results

示例#3

显示文件

文件： test.py 项目： Ambiruptor/Ambiruptor

        print("Done,", time.time() - t, "s")

        # Feature extraction (ambiguous text)
        print("************** Feature extraction (ambiguous text) ***************")
        t = time.time()

    text = """The bar of a mature tropical cyclone is a very dark gray-black
              layer of cloud appearing near the horizon as seen from an observer
              preceding the approach of the storm, and is composed of dense
              stratocumulus clouds. Cumulus and cumulonimbus clouds bearing
              precipitation follow immediately after the passage of the
              wall-like bar. Altostratus, cirrostratus and cirrus clouds are
              usually visible in ascending order above the top of the bar, while
              the wind direction for an observer facing toward the bar is
              typically from the left and slightly behind the observer."""
    words = np.array(word_tokenize(text))
    ambiguous_data = feature_extractor.extract_features(np.array(words), "bar")
    print("Shape of the matrix of features:", ambiguous_data.data.shape)
    print("Done,", time.time() - t, "s")

    print(ambiguous_data.data)
    print(ambiguous_data.targets)
    print(ambiguous_data.words)

    # Learning model
    print("************************* Learning model *************************")
    print("-- SVM with Linear Kernel (default parameters")
    t = time.time()
    model = LinearSVMClassifier()
    model.train(train_data)
    labels = model.predict(ambiguous_data)