示例#1
0
 def format_corpus(data, senses):
     """Turn raw wikitext documents into item lists for the feature extractor.

     Each document is split around wiki links of the form ``[[target]]`` or
     ``[[target|label]]``. Text between links is cleaned with
     ``Wikipedia.clean_wikitext`` and tokenized with ``word_tokenize``. Every
     link contributes exactly one item: a ``(label, sense)`` tuple when its
     normalized target is found in ``senses``, otherwise the bare label.

     Args:
         data: Iterable of wikitext strings, one per document.
         senses: Container of normalized titles; only tested with ``in``.

     Returns:
         A list with one entry per document. Each entry is a list, in
         document order, of the tokens produced by ``word_tokenize`` and of
         the link items described above.
     """
     # The outer capturing group makes ``split`` keep the links in its output,
     # so plain-text fragments and links come back in document order.
     link_splitter = re.compile(r"(\[\[[^\[\]|]*(?:\|[^\[\]|]*)?\]\])")
     # Group 1 is the link target, group 2 the optional display label.
     link_matcher = re.compile(r"\[\[([^\[\]|]*)(?:\|([^\[\]|]*))?\]\]")

     result = []
     for document in data:
         items = []
         for fragment in link_splitter.split(document):
             link = link_matcher.match(fragment)
             if link is None:
                 # Plain text between two links.
                 items.extend(word_tokenize(Wikipedia.clean_wikitext(fragment)))
                 continue

             target, label = link.groups()
             if label is None:
                 # ``[[target]]`` without a pipe: the target is also the label.
                 label = target
             sense = Wikipedia.normalize_title(target)
             items.append((label, sense) if sense in senses else label)
         result.append(items)
     return result
示例#2
0
def predictor(datadir, text):
    """Predict a sense for every known ambiguous word occurring in ``text``.

    The list of ambiguous words is read from ``<datadir>/ambiguous_words.txt``
    (one entry per line, blank lines ignored). For each entry ``w`` a pickled
    feature extractor (``<datadir>/feature_extractors/<w>.dump``) and a pickled
    model (``<datadir>/models/<w>.dump``) are loaded; entries lacking either
    file, or for which the extractor finds no occurrence, are skipped.

    Args:
        datadir: Directory holding the word list, extractors and models.
        text: Text to analyse; it is lower-cased before tokenization.

    Returns:
        A list of dicts, one per prediction, with the keys ``"begin"``,
        ``"end"``, ``"all_senses"``, ``"sense_index"`` and ``"url"``.
        Entries are processed in sorted order so the output is reproducible
        from run to run.
    """
    # List of ambiguous words
    filename_ambiguouswords = datadir + "/ambiguous_words.txt"
    with open(filename_ambiguouswords, 'r') as f:
        ambiguous_words = {line.rstrip() for line in f}
    ambiguous_words.discard("")

    # Word tokenize
    words = np.array(word_tokenize(text.lower()))

    # Running total of token lengths, computed once: offsets[i] is the summed
    # length of tokens 0..i-1, so a token spans offsets[i]..offsets[i + 1].
    # NOTE(review): these positions count token characters only; whitespace
    # dropped by the tokenizer is not included, so they presumably do not line
    # up with character offsets in ``text`` -- confirm with the consumer.
    offsets = [0]
    for token in words:
        offsets.append(offsets[-1] + len(token))

    # Disambiguation
    results = []
    # Sorted: iterating the set directly yields a different order on each run.
    for w in sorted(ambiguous_words):

        # Ambiguous word: the part of the entry before the first underscore
        ambiguous_word = w.partition("_")[0].lower()
        if not ambiguous_word:
            # Entry starts with "_": there is no word to look for.
            continue

        filename_features = datadir + "/feature_extractors/" + w + ".dump"
        filename_models = datadir + "/models/" + w + ".dump"
        # Both artefacts are required; check before unpickling anything.
        if not (os.path.isfile(filename_features)
                and os.path.isfile(filename_models)):
            continue

        # Feature extraction
        # SECURITY: pickle.load executes arbitrary code from the file; datadir
        # must only ever contain trusted, locally produced dumps.
        with open(filename_features, "rb") as f:
            ambiguous_extractor = pickle.load(f)

        ambiguous_data = ambiguous_extractor.extract_features(words, ambiguous_word)

        if ambiguous_data.data.shape[0] == 0:
            continue

        # Model prediction (same pickle caveat as above)
        with open(filename_models, "rb") as f:
            model = pickle.load(f)

        predictions = model.predict(ambiguous_data)
        all_senses = model.model.classes_.tolist()
        for index, meaning in zip(ambiguous_data.targets, predictions):
            results.append({
                "begin": offsets[index],
                "end": offsets[index + 1],
                # Fresh copy per result so callers can mutate one safely.
                "all_senses": list(all_senses),
                "sense_index": all_senses.index(meaning),
                "url": "https://en.wikipedia.org/wiki/%s" % meaning,
            })

    # Return results
    return results
示例#3
0
        print("Done,", time.time() - t, "s")

        # Feature extraction (ambiguous text)
        print("************** Feature extraction (ambiguous text) ***************")
        t = time.time()

    text = """The bar of a mature tropical cyclone is a very dark gray-black
              layer of cloud appearing near the horizon as seen from an observer
              preceding the approach of the storm, and is composed of dense
              stratocumulus clouds. Cumulus and cumulonimbus clouds bearing
              precipitation follow immediately after the passage of the
              wall-like bar. Altostratus, cirrostratus and cirrus clouds are
              usually visible in ascending order above the top of the bar, while
              the wind direction for an observer facing toward the bar is
              typically from the left and slightly behind the observer."""
    words = np.array(word_tokenize(text))
    ambiguous_data = feature_extractor.extract_features(np.array(words), "bar")
    print("Shape of the matrix of features:", ambiguous_data.data.shape)
    print("Done,", time.time() - t, "s")

    print(ambiguous_data.data)
    print(ambiguous_data.targets)
    print(ambiguous_data.words)

    # Learning model
    print("************************* Learning model *************************")
    print("-- SVM with Linear Kernel (default parameters")
    t = time.time()
    model = LinearSVMClassifier()
    model.train(train_data)
    labels = model.predict(ambiguous_data)