pip install nltk==3.3
pip install spacy==2.0.11
python -m spacy download en_core_web_sm
"""
import random

import nltk
nltk.download("wordnet")  # Downloading the wordnet corpus
from nltk.corpus import wordnet as wn
import spacy
from spacy.lang.en import English

# Making sure that the versions are exactly the same
assert nltk.__version__ == "3.3"
assert wn.get_version() == "3.0"
assert spacy.__version__ == "2.0.11"


def main():
    # Loading the tokenizer
    spacy_nlp = spacy.load("en_core_web_sm")
    tokenizer = English().Defaults.create_tokenizer(spacy_nlp)

    # Obtaining all the synsets and splitting them into train, dev and test.
    # Splitting along synsets (and not instances) is important to not taint the test data.
    all_synsets = list(wn.all_synsets())
    random.seed(742382)
    random.shuffle(all_synsets)

    # 0.8/0.1/0.1 train/dev/test split
    split_index_train_dev = int(len(all_synsets) * 0.8)
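    # Not part of the original file: the fragment cuts off here, so below is a minimal
    # sketch of how the remaining 0.8/0.1/0.1 slicing could be completed. The variable
    # names (split_index_dev_test, train/dev/test_synsets) are assumptions.
    split_index_dev_test = int(len(all_synsets) * 0.9)
    train_synsets = all_synsets[:split_index_train_dev]
    dev_synsets = all_synsets[split_index_train_dev:split_index_dev_test]
    test_synsets = all_synsets[split_index_dev_test:]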
try:
    nltk.download(token, quiet=True, raise_on_error=True)
except ValueError:
    # Sometimes there are problems with the default index.xml URL. Then we will try this...
    from nltk.downloader import Downloader as NLTKDownloader
    d = NLTKDownloader("http://nltk.github.com/nltk_data/")
    d.download(token, quiet=True, raise_on_error=True)

# Use the Brown corpus for calculating information content (IC);
# wn_ic refers to nltk.corpus.wordnet_ic.
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

# ---------------------------------------------------------------------------------------------------
DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
    "i": ("í", "ï", "î", "ì"),
    "o": ("ó", "ö", "ô", "ò", "ō", "ø"),
    "u": ("ú", "ü", "û", "ù", "ů"),
    "y": ("ý", "ÿ", "ý"),
    "s": ("š",),
    "c": ("ç", "č"),
    "n": ("ñ",),
    "z": ("ž",)
}
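# Not part of the original file: a minimal sketch (using standard NLTK synset APIs) of how
# the Brown IC dictionary and IC_MAX above are typically consumed, e.g. to normalise Resnik
# similarity into [0, 1]. The function name is hypothetical.
def normalized_resnik(synset_a, synset_b):
    # Resnik similarity is the information content of the least common subsumer;
    # dividing by the maximum IC observed for that part of speech rescales it to [0, 1].
    pos = synset_a.pos()
    raw = synset_a.res_similarity(synset_b, IC_CORPUS)
    return raw / IC_MAX[pos] if IC_MAX.get(pos) else 0.0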
    df = find_synonyms(df)
    df = create_new_aspects_from_synonyms(df)
    df["aspect"] = flatten_column_lists(df["aspect"])
    df["opinion"] = flatten_column_lists(df["opinion"])
    df = reformat_output_file(df, 3)
    save_file(df, name + "_WORDNET_WSD")
    end = timer()
    logging.debug("Whole program: %.2f seconds" % (end - start))
    # wsd_pywsd_simple_lesk(df)
    # wsd_pywsd_adapted_lesk(df)
    # find_synonyms(df)


if __name__ == '__main__':
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    logging.debug("Wordnet version: %s" % wn.get_version())
    logging.debug("Wordnet adjective: %s" % wn.ADJ)
    logging.debug("Wordnet verb: %s" % wn.VERB)
    logging.debug("Wordnet noun: %s" % wn.NOUN)
    logging.debug("Wordnet adverb: %s" % wn.ADV)
    argument = return_sys_arguments(sys.argv)
    if argument is None:
        print("You didn't give an argument")
    elif os.path.isdir(argument):
        files = read_folder_contents(argument)
        print("Gave a folder: %s, that has %s files." % (argument, str(len(files))))
        x = 0
        for f in files:
            x += 1
            df = open_file(argument + "/" + f, "pandas")
from nltk.corpus import wordnet as wn

try:
    wn.get_version()
except LookupError:
    # The wordnet data is missing; fetch it before anything else touches the corpus.
    import nltk
    nltk.download('wordnet')

import xml.etree.ElementTree as ET

import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

import logging.config
logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
})


class SemcorReader:

    def read_sequences(self, in_file, limit=-1):
        root = ET.parse(in_file).getroot()
        for i, s in enumerate(root.findall('text/sentence')):
            if i == limit:
                break
            seq_tokens = []
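            # Not part of the original file: a sketch of how the token loop might continue,
            # assuming the common WSD-framework XML layout where each sentence holds <wf> and
            # <instance> children with lemma/pos attributes; the tuple layout is an assumption.
            for token in s:
                lemma = token.get('lemma')
                pos = token.get('pos')
                instance_id = token.get('id') if token.tag == 'instance' else None
                seq_tokens.append((token.text, lemma, pos, instance_id))
            yield seq_tokens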
import sys

import nltk
from nltk.corpus import brown as brown_corpus
from nltk.corpus import wordnet


def die_nltk_data_error(corpus):
    sys.stderr.write("Missing nltk data (%s). Use the install_nltk_data.sh script\n" % corpus)
    sys.exit(1)


try:
    brown_corpus.words()
except LookupError:
    die_nltk_data_error('brown')

try:
    wordnet.get_version()
except LookupError:
    die_nltk_data_error('wordnet')


def extract_keywords(prompt):
    # NOTE: nltk.word_tokenize also needs the 'punkt' tokenizer data to be installed.
    tokens = nltk.word_tokenize(prompt)
    # Build the Brown frequency distribution - slow!
    fd = nltk.FreqDist(brown_corpus.words())
    # Decorate-sort-undecorate by frequency
    tokens_with_freq = [(fd.freq(t), t) for t in tokens]
    for _, t in sorted(tokens_with_freq):
        print(t)
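# Not part of the original file: an example invocation. Because the list is sorted by
# ascending Brown-corpus frequency, the rarest (most content-bearing) tokens print first;
# the prompt string here is just an illustration.
if __name__ == '__main__':
    extract_keywords("a quiet village beneath an ancient volcano")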
if __name__ == '__main__':
    similarity = wup
    T = 0.76
    results_prefix = 'all_kadist_works'
    file_trials = 'data/all_annotated_trials.json'  # annotated with tag_kadist_docs.py
    compute_person_metrics = False
    abbreviated = False
    abbreviated_size = 100
    cluster_types = ['clusters', 'superclusters']
    random.seed(42)

    print(' *', 'using WordNet version:', wordnet.get_version())
    print(' *', 'using', 'similarity fn', similarity.__name__, 'T', T)
    print(' *', 'compute_person_metrics', compute_person_metrics)
    print(' *', 'results_prefix', results_prefix)

    if abbreviated:
        cluster_types = ['clusters']
        print(' *', 'abbreviated mode, limiting to {} (stable sample) trials'.format(abbreviated_size))

    for cluster_type in cluster_types:
        file_clusters = f'data/{cluster_type}.json'
        with codecs.open(file_clusters, 'rb', 'utf-8') as f_clusters:
            clusters = preprocess_clusters(json.loads(f_clusters.read()))
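# Not part of the original file: `wup` is used above but defined or imported elsewhere in
# the file (not shown). Presumably it wraps NLTK's Wu-Palmer synset similarity, roughly
# like this sketch.
def wup(synset_a, synset_b):
    # wordnet.wup_similarity can return None when the synsets share no usable hierarchy;
    # treating that as 0.0 keeps the score numeric for thresholding against T.
    return wordnet.wup_similarity(synset_a, synset_b) or 0.0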