def format_corpus(data, senses):
    """Format the corpus in a shape that can be analysed by the feature extractor.

    Every document is cut around its wikilinks, written either as
    ``[[target]]`` or as ``[[target|label]]``.  Plain text between links is
    cleaned with ``Wikipedia.clean_wikitext`` and tokenized with
    ``word_tokenize``.  A link contributes exactly one item: its label (the
    target itself when no label is given), left untokenized.

    Args:
        data: Iterable of wikitext strings, one per document.
        senses: Container of normalized titles; only links whose normalized
            target is in it are annotated.

    Returns:
        A list with one entry per document.  Each entry is a list mixing
        plain string tokens with ``(label, sense)`` tuples, where ``sense``
        is ``Wikipedia.normalize_title(target)`` and belongs to ``senses``.
        Links pointing to any other title appear as the bare label string.
    """
    # The splitter wraps the whole link in a capturing group so that
    # ``split`` keeps the links in its output instead of dropping them.
    link_splitter = re.compile(r"(\[\[[^\[\]|]*(?:\|[^\[\]|]*)?\]\])")
    link_matcher = re.compile(r"\[\[([^\[\]|]*)(?:\|([^\[\]|]*))?\]\]")

    result = []
    for document in data:
        items = []
        for chunk in link_splitter.split(document):
            link = link_matcher.match(chunk)
            if link is None:
                # Ordinary text between two links.
                items.extend(word_tokenize(Wikipedia.clean_wikitext(chunk)))
                continue

            target, label = link.groups()
            if label is None:
                # ``[[target]]`` form: the target doubles as the label.
                label = target
            sense = Wikipedia.normalize_title(target)
            if sense in senses:
                items.append((label, sense))
            else:
                items.append(label)
        result.append(items)
    return result
def predictor(datadir, text):
    """Disambiguate the known ambiguous words that occur in ``text``.

    The data directory is expected to contain:

    * ``ambiguous_words.txt``: one entry per line; the part of an entry
      before its first ``_`` is the word looked up in the text;
    * ``feature_extractors/<entry>.dump``: pickled feature extractor;
    * ``models/<entry>.dump``: pickled model.

    Entries for which either dump file is missing, or for which the feature
    extractor yields no sample, are skipped.

    Args:
        datadir: Path of the data directory.
        text: Text to analyse; it is lower-cased before tokenization.

    Returns:
        A list of dicts, one per prediction, with the keys ``begin``,
        ``end``, ``all_senses`` (the classes known to the model),
        ``sense_index`` (position of the predicted sense in ``all_senses``)
        and ``url`` (Wikipedia URL built from the predicted sense).
        Entries are processed in sorted order, so the result order is
        reproducible.

        NOTE(review): ``begin``/``end`` are computed from the cumulated
        lengths of the tokens only, so characters dropped by the tokenizer
        (e.g. whitespace) are not counted -- confirm that consumers expect
        offsets in that space and not offsets into ``text``.

    Raises:
        IOError/OSError: if ``ambiguous_words.txt`` cannot be opened.
    """
    # List of ambiguous words
    filename_ambiguouswords = datadir + "/ambiguous_words.txt"
    with open(filename_ambiguouswords, 'r') as f:
        ambiguous_words = {line.rstrip() for line in f}
    ambiguous_words.discard("")

    # Word tokenize
    words = np.array(word_tokenize(text.lower()))

    # offsets[i] is the total length of the tokens preceding token i;
    # computed once instead of re-summing the prefix for every prediction.
    offsets = [0]
    for token in words:
        offsets.append(offsets[-1] + len(token))

    # Disambiguation
    results = []
    for w in sorted(ambiguous_words):
        # Ambiguous word: everything before the first underscore
        stem = re.match(r"[^_]+", w)
        if stem is None:
            # The entry starts with "_": there is no word to look for.
            continue
        ambiguous_word = stem.group(0).lower()

        # Both dumps are required, so check them before loading anything.
        filename_features = datadir + "/feature_extractors/" + w + ".dump"
        filename_models = datadir + "/models/" + w + ".dump"
        if not (os.path.isfile(filename_features)
                and os.path.isfile(filename_models)):
            continue

        # Feature extraction
        # NOTE(security): unpickling can execute arbitrary code; the data
        # directory must only contain trusted dumps.
        with open(filename_features, "rb") as f:
            ambiguous_extractor = pickle.load(f)
        ambiguous_data = ambiguous_extractor.extract_features(words, ambiguous_word)
        if ambiguous_data.data.shape[0] == 0:
            continue

        # Model prediction
        with open(filename_models, "rb") as f:
            model = pickle.load(f)
        predictions = model.predict(ambiguous_data)
        all_senses = model.model.classes_.tolist()
        for index, meaning in zip(ambiguous_data.targets, predictions):
            begin = offsets[index]
            results.append({
                "begin": begin,
                "end": begin + len(words[index]),
                # Copied so that each result owns an independent list.
                "all_senses": list(all_senses),
                "sense_index": all_senses.index(meaning),
                "url": "https://en.wikipedia.org/wiki/%s" % meaning,
            })

    # Return results
    return results
print("Done,", time.time() - t, "s") # Feature extraction (ambiguous text) print("************** Feature extraction (ambiguous text) ***************") t = time.time() text = """The bar of a mature tropical cyclone is a very dark gray-black layer of cloud appearing near the horizon as seen from an observer preceding the approach of the storm, and is composed of dense stratocumulus clouds. Cumulus and cumulonimbus clouds bearing precipitation follow immediately after the passage of the wall-like bar. Altostratus, cirrostratus and cirrus clouds are usually visible in ascending order above the top of the bar, while the wind direction for an observer facing toward the bar is typically from the left and slightly behind the observer.""" words = np.array(word_tokenize(text)) ambiguous_data = feature_extractor.extract_features(np.array(words), "bar") print("Shape of the matrix of features:", ambiguous_data.data.shape) print("Done,", time.time() - t, "s") print(ambiguous_data.data) print(ambiguous_data.targets) print(ambiguous_data.words) # Learning model print("************************* Learning model *************************") print("-- SVM with Linear Kernel (default parameters") t = time.time() model = LinearSVMClassifier() model.train(train_data) labels = model.predict(ambiguous_data)