Example #1
def wnToBn(wn_labels, wordNet_to_babelNet, wordnetCompression):
    """
    Retrieve the list of labels containing BabelNet synsets

    :param wn_labels: list of labels with wordnet synsets
    :param  wordNet_to_babelNet: dictionary containing the mapping wn_synset => bn_synset
    :param wordnetCompression: boolean flag indicating whether labels must first be converted from sensekey => wn_synset before applying the wn_synset => bn_synset mapping
    :return bn_label_synsets: list of labels with babelnet synsets
    
    """

    bn_label_synsets = []
    for label_sequence in wn_labels:
        temp = []
        for label in label_sequence:
            if wordnetCompression:
                if 'wn:' in label:
                    temp.append(wordNet_to_babelNet[label])
                else:
                    temp.append(label)
            else:
                if re.search(r'(%[1-9])', label):
                    pos = wn.lemma_from_key(label).synset().pos()
                    offset = wn.lemma_from_key(label).synset().offset()
                    synset_id = "wn:" + str(offset).zfill( 8) + pos
                    temp.append(wordNet_to_babelNet[synset_id])
                else:
                    temp.append(label)
        bn_label_synsets.append(temp)

    return bn_label_synsets
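Most of the snippets on this page build a "wn:<offset><pos>" identifier from a sensekey via wn.lemma_from_key. A minimal standalone sketch of that conversion (the helper name sensekey_to_wn_id is illustrative, not taken from any example, and it assumes the NLTK WordNet corpus is installed):

from nltk.corpus import wordnet as wn

def sensekey_to_wn_id(sensekey):
    # sensekey -> Lemma -> Synset -> "wn:" + zero-padded offset + POS tag
    synset = wn.lemma_from_key(sensekey).synset()
    return "wn:" + str(synset.offset()).zfill(8) + synset.pos()

print(sensekey_to_wn_id('live%2:42:06::'))  # wn:02614387v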
Example #2
def build_vocab_synset_SemCor():

    target_file = open(
        "../../WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt",
        "r")

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()

    # iterate through all definitions in the SemCor
    for line in target_file:

        # synset and literal definition from the WN
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()

        # convert '.' to '__' for hashing
        synset = synset.name().replace('.', '__')
        vocab.add_word(synset)

    # add SemEval synsets
    semeval_file = open(
        "../../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt",
        "r")
    for line in semeval_file:
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        synset = synset.name().replace('.', '__')
        vocab.add_word(synset)

    print("Total vocabulary size: {} {}".format(vocab.idx, len(vocab)))
    return vocab
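The Vocabulary wrapper used here (and again in Example #28) is not part of the snippet. A minimal sketch of the kind of class these examples appear to assume, with add_word, an idx counter, and __len__ (hypothetical, not the original implementation):

class Vocabulary:
    # Simple word <-> index wrapper; add_word registers unseen words only.
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)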
Example #3
def add_word(vocab, word, production_rule, keys = None, aliases = None):
    """
    Get the synset for a word given its lemma key
    """

    if word is None or len(word.strip()) == 0:
        raise ConfigException("word can't be blank")

    word = word.strip().lower()

    import nltk, re
    from nltk.corpus import wordnet as wn
    from word import BaseWord

    synsets = []
    if keys is not None:
        # Create a list of synsets for this word, if any lemma keys were provided.
        if isinstance(keys, (list, tuple)):
            for k in keys:
                k = k.strip()
                s = wn.lemma_from_key(k).synset()
                if s is None:
                    raise ConfigException('lemma_from_key("%s") returned None' % (k,))
                elif s not in synsets:
                    synsets.append(s)
        else:
            s = wn.lemma_from_key(keys.strip()).synset()
            if s is None:
                raise ConfigException('lemma_from_key("%s") returned None' % (keys.strip(),))
            elif s not in synsets:
                synsets.append(s)

    alias_list = []

    try:
        # Make sure that the word itself is in its own alias list,
        # assuming it's a valid regular expression.
        re.compile(word)
        alias_list.append(word)
    except re.error:
        pass

    # Create a list of aliases if applicable
    if aliases is not None:
        if isinstance(aliases, (list, tuple)):
            for a in aliases:
                a = a.strip()
                if len(a) > 0 and a not in alias_list:
                    alias_list.append(a)
        else:
            a = aliases.strip()
            if len(a) > 0 and a not in alias_list:
                alias_list.append(a)

    # Finally, create a BaseWord object from the collected synsets and aliases
    baseword = BaseWord(word, production_rule, synsets, alias_list)
    vocab.append(baseword)
Example #4
def patched_lemma_from_key(key, wordnet=wordnet):
    try:
        lemma = wordnet.lemma_from_key(key)
    except WordNetError as e:
        if key in patching_data:
            lemma = wordnet.lemma(patching_data[key])
        elif '%3' in key:
            lemma = wordnet.lemma_from_key(key.replace('%3', '%5'))
        else:
            raise e
    return lemma
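A hedged usage sketch for the helper above: patching_data is assumed to be a corpus-specific dictionary mapping broken sensekeys to wordnet.lemma() names, and the '%3' -> '%5' retry covers keys that tag adjective-satellite senses as plain adjectives.

from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import WordNetError

patching_data = {}  # hypothetical corpus-specific fix-up table

print(patched_lemma_from_key('live%2:42:06::'))  # expected: Lemma('live.v.02.live')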
Example #5
 def parseXsl(self, fname):
     f = xlrd.open_workbook(fname)
     f_sheet = f.sheet_by_index(0)
     translator = {}
     for row in range(1, f_sheet.nrows):
         verb_obj = f_sheet.cell(row, 0)
         noun_obj = f_sheet.cell(row, 3)
         v = verb_obj.value
         n = noun_obj.value
         translator[wn.lemma_from_key(v).synset()] = wn.lemma_from_key(
             n).synset()
     return translator
Example #6
def make_wn2_wn3():
    key_map = {}
    id_map = {}
    curr_path = os.getcwd() + "/"
    _rel_path = curr_path.split('/data/')[0]
    print(_rel_path)
    _path = _rel_path + '/data/wn2.1_3.0'
    print("PTH", _path)
    mono_files = glob.glob(os.path.join(_path, '*.mono'))
    poly_files = glob.glob(os.path.join(_path, '*.poly'))

    for file_name in mono_files:
        with open(file_name) as fp:
            for each in fp.readlines():
                wn2_sense_key , wn2_synset_offs ,\
                    wn3_sense_key , wn3_synset_offs = each.strip().split(' ')

                ref_syn = wn.lemma_from_key(wn3_sense_key).synset()
                wn2_word = ref_syn.name().split('.')[0]
                wn2_pos = ref_syn.pos()
                key_map[wn2_sense_key] = [wn3_sense_key]
                id_map[(wn2_word , wn2_pos ,1)] = (wn3_sense_key , \
                                                        wn3_synset_offs , 1)

    for file_name in poly_files:
        with open(file_name) as fp:
            for each in fp.readlines():
                all_keys = each.strip().split(' ')
                wn2_sense_key , wn2_synset_offs , wn2_sense_number = \
                                                            all_keys[1].split(';')
                wn3_vals = [tuple(each.split(';')) \
                                for each in all_keys[2:]]
                key_vals = []
                id_vals = []
                for wn3_sense_key, wn3_synset_offs, wn3_sense_number in wn3_vals:
                    if wn3_sense_key:
                        id_vals.append((wn3_sense_key ,wn3_synset_offs ,\
                                                        wn3_sense_number))
                        key_vals.append(wn3_sense_key)

                if key_vals:
                    ref_syn = wn.lemma_from_key(key_vals[0]).synset()
                    wn2_word = ref_syn.name().split('.')[0]
                    wn2_pos = ref_syn.pos()
                    key_map[wn2_sense_key] = key_vals
                    id_map[(wn2_word, wn2_pos, wn2_sense_number)] = id_vals

    with open(_rel_path + "/data/wn3_id_map", "wb") as file_handle:
        pickle.dump(id_map, file_handle)

    with open(_rel_path + "/data/wn3_key_map", "wb") as file_handle:
        pickle.dump(key_map, file_handle)
    return id_map, key_map
Example #7
def lemmaToWNsynset(sentences, labels, isTrain):
    """

    Transform every lemma associated to a sensekey in a wordnet synset. 
    Then add the mapping lemma => wn_synset, wn_synset => sensekey in two dictionaries dictionary that will be saved as a .txt file.
    
    :param sentences: list of sentences
    :param labels: list of labels composed by lemmas and sensekeys
    :param isTrain: boolean variable used to distinguish between training and dev set operations
    :param wordnetCompression: boolean variable used to check whether the wordnet_synset => sensekeys mapping must be saved
    :return updated_labels = list of labels with lemmas and wn_synsets 
    
    """
    lemma_to_wn = {} #Dictionary containing the mapping lemma => wordnet synset
    updated_labels = []

    #For every sentence of the dataset
    for i in range(len(sentences)):
        temp = []
        current_label_sequence= labels[i]
        current_sentence = sentences[i]
        #For every token in the current sentence
        for j in range(len(current_sentence)):
            lemma = current_sentence[j]
            label = current_label_sequence[j]
            #Check if the label is a sensekey
            if re.search(r'(%[1-9])', label):
                #From the sensekey extract the synset
                pos = wn.lemma_from_key(label).synset().pos()
                offset = wn.lemma_from_key(label).synset().offset()
                wn_synset = "wn:" + str(offset).zfill( 8) + pos
                #Add pair (lemma, wordnet_synset) to the dictionary
                if not lemma in lemma_to_wn:
                    lemma_to_wn[lemma] = [wn_synset]
                else:
                    if not wn_synset in lemma_to_wn[lemma]:
                        lemma_to_wn[lemma].append(wn_synset)
                lemma = wn_synset
            temp.append(lemma)
        updated_labels.append(temp)

    #If we worked on the training set, save the dictionary into two files
    if isTrain:
        if not os.path.exists("../../resource/Mapping_Files/lemma_to_wn.txt"):
            with open("../../resource/Mapping_Files/lemma_to_wn.txt", 'w') as file:
                for elem in lemma_to_wn:
                    line = elem + " " + " ".join(lemma_to_wn[elem])
                    file.write(line + "\n")

    return updated_labels
Example #8
def extractLabelIdentifier(sentences, ids, lemmas_mapping, vocab_identifier,
                           wordnetCompression):
    """
    Every lemma to disambiguate is associated to a label which is mapped to an integer.
    This mapping is contained in the vocab_identifier variable- Lemmas to disambiguate are first searched in the "lemmas_mapping" dictionary.
    If they are not present there the MSF (most frequent sense) method is applied. Thus their label is recovered from the WordNet interface.
    
    :param sentences: list of tokenized sentences
    :param ids: list of identifier for lemmas to predict
    :param lemmas_mapping: lemma => label mapping
    :param vocab_identifier: label => integer mapping
    :param wordnetCompression: boolean variable used to check if a model with the wordnet synset compression method has been used
    :return identifiers_list: list of list of integers and sensekey; each internal list correspond to all the sensekey identifier associated to a lemma
    """

    identifiers_list = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        id_sequence = ids[i]
        sentence_ids = []
        for j in range(len(sentence)):
            word = sentence[j]
            id = id_sequence[j]
            word_ids = []
            #Check if the current word was met during training
            if not id == '0':
                if word in lemmas_mapping:
                    #If it is, extract the sensekeys associated with the lemma
                    sensekeys = lemmas_mapping[word]
                    #Then search for all the sensekeys their identifier
                    for sensekey in sensekeys:
                        word_ids.append(vocab_identifier[sensekey])
                else:
                    #Take the most frequent sense from wordnet
                    mfs = wn.synsets(word)[0].name()
                    #Retrieve the corresponding sensekey
                    sensekey = wn.synset(mfs).lemmas()[0].key()
                    if wordnetCompression:
                        #Transform the sensekey into a wordnet synset
                        pos = wn.lemma_from_key(sensekey).synset().pos()
                        offset = wn.lemma_from_key(sensekey).synset().offset()
                        wn_synset = "wn:" + str(offset).zfill(8) + pos
                        word_ids.append(wn_synset)
                    else:
                        word_ids.append(sensekey)
            if word_ids:
                sentence_ids.append(word_ids)
        identifiers_list.append(sentence_ids)

    return identifiers_list
Example #9
def create_lookup_entry(lemma, dante_answer, answer_id):
    """
    Returns True if it failed
    """
    try:
        if lemma == "U":
            wn_definition = "NO DEFINITION FOUND - CHECK CONTEXT"
        else:
            wn_definition = wn.lemma_from_key(lemma).synset.definition
        write_string = str("For answer: " + answer_id +
                           "\nAre these definitions of " +
                           dante_answer["headword"] + " the same?\n" +
                           wn_definition + "\n          AND\n" +
                           dante_answer["meaning"] + "\n")
        yes_or_no = None
        while yes_or_no != "y" and yes_or_no != "n":
            stdout.write("\r%s" % write_string)
            yes_or_no = raw_input("y or n?: ")
            stdout.flush()
            sleep(1)
        if yes_or_no == "y":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], True)
        elif yes_or_no == "n":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], False)
        else:
            raise ValueError("Wrong input squeezed through")
        stdout.write("\n")
        return True
    except WordNetError as err:
        synsets = wn.synsets(dante_answer["headword"])
        print "head word " + dante_answer[
            "headword"] + " and lemma " + lemma + " caused error: " + err.message
        print "Synsets: " + str(synsets)
        print "FIX IT BEFORE NEXT RUN"
        return False
Example #10
def _disambiguation():
    ev = [
        'noun.phenomenon', 'noun.act', 'noun.event', 'noun.attribute',
        'adj.all', 'adv.all'
    ]
    true_splits = []
    with cd(path.join('lib', 'ims_0.9.2.1')):
        pattern = '<x(.+?)</x>'
        with open('out_.txt', 'r',
                  encoding='utf-8') as f, open("test_.txt",
                                               'r',
                                               encoding='utf-8') as f1:
            for line, line1 in zip(f, f1):
                matches = re.finditer(pattern, line)
                lexnames = []
                for m in matches:
                    key = re.search('(?<=\s)([^ ]+?)(?=\|)', m[0]).group(
                        0)  # for '   natural%3:00:03::|'
                    try:
                        lexname = wn.lemma_from_key(key).synset().lexname()
                        lexnames.append(lexname)
                    except WordNetError:
                        print(key)
                print(lexnames)
                print(line1)
                if set(lexnames).intersection(set(ev)):
                    true_splits.append(line1.strip())
    print('Disambiguation: ', true_splits)
    return true_splits
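For reference, the lexname lookup above in isolation, using the same try/except fallback (the sensekey is the one from the inline comment; whether it resolves depends on the WordNet version):

from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError

key = 'natural%3:00:03::'  # taken from the inline comment in the example above
try:
    # lexname() returns the lexicographer file name, e.g. 'adj.all' or 'noun.act'
    print(wn.lemma_from_key(key).synset().lexname())
except WordNetError:
    print(key)  # same fallback as in _disambiguation()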
Example #11
def convert_goldkey2domain(input_path: str,
                           output_path: str,
                           bn2domain: Dict[str, str],
                           is_bn: bool = True) -> None:
    """
    This method is used to convert a goldkey map
    :param input_path: path of goldkey to convert
    :param output_path: path where it writes
    :param bn2domain: a map Babelnet 2 domain
    :param is_bn: if True, it converts in Babelnet format
                if False, it converts in coarse-grained format
    :return: None
    """

    wn2bn = read_map(config.BABELNET2WORDNET_TR, reverse=True)
    with open(str(output_path), mode="w") as out_file:
        with open(str(input_path), mode="r") as in_file:
            for line in in_file:
                line = line.strip().split()

                syn = wn.lemma_from_key(line[1]).synset()
                syn = "wn:" + str(syn.offset()).zfill(8) + syn.pos()
                syn = _get_frequent_sense(syn, wn2bn, bn2domain, is_bn)

                out_file.write(line[0] + " " + syn + "\n")
Example #12
def create_labels_words(path=path_dataset + "semcor+omsti.gold.key.txt"):
    ###############################################################################
    # This function, given the gold file, creates a dictionary of labels (babelnet id, Wndomain, Lexname)
    # for each ambiguous words
    #
    # Input:
    #   path: path of the gold file
    #
    # Output:
    #   dict_sensekey: dictionary of labels
    ###############################################################################

    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path)]

    dict_sensekey = {}

    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the wordnet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]

        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]

        # add the factotum label to all the words which don't have a wndomain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    return dict_sensekey
Example #13
def parse_corpus(path_name='Training_Corpora/SemCor/'):
    """This function parse the training data"""
    keys_path = glob.glob(os.path.join(path_name, '*gold.key.txt'))[0]
    sentences_path = glob.glob(os.path.join(path_name, '*data.xml'))[0]

    keys = dict()
    with open(keys_path, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            id_ = line[0]
            synset_keys = line[1:]
            synsets = [wn.lemma_from_key(k).synset() for k in synset_keys]
            keys[id_] = synsets

    with open(sentences_path, 'r') as f:
        tree = etree.parse(f)

    training = []

    for sentence in tree.xpath('//sentence'):
        sent_id = sentence.attrib['id']
        tags = []
        words = []
        for chunck in sentence[:]:
            type_ = chunck.tag
            if type_ == 'instance':
                tags.append(keys[chunck.attrib['id']])
            if chunck.attrib['pos'] != '.' and chunck.attrib['lemma'] not in STOPWORD:
                words.append(chunck.attrib['lemma'])
        training.append((words, list(set(itertools.chain.from_iterable(tags)))))
    return training
Example #14
def lemma_from_key(key):
    """
        This function returns lemma object given the lemma key.
        This is similar to :func:`lemma` but it needs to supply the key
        of lemma instead of the name.

        .. note::
            Support only English language (*eng*).

        :param str key: key of the lemma object

        :return: lemma object with the given key
        :rtype: :class:`Lemma`

        :Example:

            >>> from pythainlp.corpus.wordnet import lemma, lemma_from_key
            >>>
            >>> practice = lemma('practice.v.01.exercise')
            >>> practice.key()
            exercise%2:41:00::
            >>> lemma_from_key(practice.key())
            Lemma('practice.v.01.exercise')
    """
    return wordnet.lemma_from_key(key)
Example #15
def calc_fiwn_counts():
    en2fi = {}
    for (fi_synset_key, fi_lemma_str, en_synset_key, en_lemma_str, rel,
         extra) in get_transl_iter():
        if rel != "synonym":
            continue

        fi_lemma = get_lemma(fiwn, fi_synset_key, fi_lemma_str)
        assert fi_lemma is not None

        en_lemma = get_lemma(wordnet, en_synset_key, en_lemma_str)
        assert en_lemma is not None

        en2fi.setdefault(en_lemma.key(), []).append(fi_lemma.key())
    divisors = set()
    counts = {}
    for en, fis in en2fi.items():
        for fi in fis:
            counts.setdefault(fi, 0.0)
            try:
                en_lemma = wordnet.lemma_from_key(en)
            except WordNetError:
                # The following lemmas are not in the PWN sense index for some reason:
                # ['earth%1:17:02::', 'ddc%1:06:01::', 'kb%1:23:01::', 'sun%1:17:02::',
                # 'moon%1:17:03::', 'earth%1:15:01::', 'ddi%1:06:01::', 'kb%1:23:03::']
                pass
            else:
                div = len(fis)
                divisors.add(div)
                counts[fi] += en_lemma.count() / div
    mult = reduce(lcm, divisors)
    for lemma, cnt in counts.items():
        counts[lemma] = int((cnt * mult) + 0.5)
    return counts
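The reduce and lcm names above come from the surrounding module; a minimal sketch of the helpers this snippet presumably relies on (math.lcm only exists in Python 3.9+, so older code usually derives it from gcd):

from functools import reduce
from math import gcd

def lcm(a, b):
    # least common multiple via gcd; reduce(lcm, divisors) then folds it over the set
    return a * b // gcd(a, b)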
Example #16
def create_lookup_entry(lemma, dante_answer, answer_id):
    """
    Returns True if it failed
    """
    try:
        if lemma == "U":
            wn_definition = "NO DEFINITION FOUND - CHECK CONTEXT"
        else:
            wn_definition = wn.lemma_from_key(lemma).synset.definition
        write_string = str("For answer: " + answer_id + "\nAre these definitions of " + dante_answer["headword"] +
                           " the same?\n" + wn_definition + "\n          AND\n" + dante_answer["meaning"] + "\n")
        yes_or_no = None
        while yes_or_no != "y" and yes_or_no != "n":
            stdout.write("\r%s" % write_string)
            yes_or_no = raw_input("y or n?: ")
            stdout.flush()
            sleep(1)
        if yes_or_no == "y":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], True)
        elif yes_or_no == "n":
            DanteAPI.write_to_lookup(lemma, dante_answer["meaning"], False)
        else:
            raise ValueError("Wrong input squeezed through")
        stdout.write("\n")
        return True
    except WordNetError as err:
        synsets = wn.synsets(dante_answer["headword"])
        print "head word " + dante_answer["headword"] + " and lemma " + lemma + " caused error: " + err.message
        print "Synsets: " + str(synsets)
        print "FIX IT BEFORE NEXT RUN"
        return False
Example #17
 def get_synset_from_sense_key(sense_key):
     try:
         lemma = wn.lemma_from_key(sense_key)
     except:
         # print("no synset found for: " + sense_key)
         return "NA"
     return str(lemma.synset())
Example #18
    def wsd_input_format(wsd_src_data, eval_result):
        '''
        test_data[0] {'target_word': u'art#n', 'target_sense': None, 'id': 'senseval2.d000.s000.t000', 'context': ['the', '<target>', 'of', 'change_ringing', 'be', 'peculiar', 'to', 'the', 'english', ',', 'and', ',', 'like', 'most', 'english', 'peculiarity', ',', 'unintelligible', 'to', 'the', 'rest', 'of', 'the', 'world', '.'], 'poss': ['DET', 'NOUN', 'ADP', 'NOUN', 'VERB', 'ADJ', 'PRT', 'DET', 'NOUN', '.', 'CONJ', '.', 'ADP', 'ADJ', 'ADJ', 'NOUN', '.', 'ADJ', 'PRT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', '.']}
        '''
        wsd_input = []
        senses_input = []

        for i in range(len(eval_result)):
            block = i // 32  # integer block index
            src_word1, src_word2 = backward_step1_in[
                2 * block], backward_step1_in[2 * block + 1]
            tgt_sent = wsd_src_data[i].decode().encode(
                'utf-8') + ' ' + eval_result[i]
            tgt_word = src_word1

            synset = wn.lemma_from_key(tgt_word).synset()
            s = synset.name()
            target_word = '#'.join(s.split('.')[:2])
            context = tgt_sent.split(' ')

            for j in range(len(context)):
                if context[j] == tgt_word:
                    context[j] = '<target>'
            poss_list = ['.' for _ in range(len(context))]
            tmp_dict = {
                'target_word': target_word,
                'target_sense': None,
                'id': None,
                'context': context,
                'poss': poss_list
            }
            wsd_input.append(tmp_dict)
            senses_input.append((src_word1, src_word2))
        return wsd_input, senses_input
Example #19
def parse_corpus(path_name='Training_Corpora/SemCor/'):
    """This function parse the training data"""
    keys_path = glob.glob(os.path.join(path_name, '*gold.key.txt'))[0]
    sentences_path = glob.glob(os.path.join(path_name, '*data.xml'))[0]

    keys = dict()
    with open(keys_path, 'r') as f:
        for line in f:
            line = line.strip().split(' ')
            id_ = line[0]
            synset_keys = line[1:]
            synsets = [wn.lemma_from_key(k).synset() for k in synset_keys]
            keys[id_] = synsets

    with open(sentences_path, 'r') as f:
        tree = etree.parse(f)

    training = []

    for sentence in tree.xpath('//sentence'):
        sent_id = sentence.attrib['id']
        tags = []
        words = []
        for chunck in sentence[:]:
            type_ = chunck.tag
            if type_ == 'instance':
                tags.append(keys[chunck.attrib['id']])
            if chunck.attrib['pos'] != '.' and chunck.attrib[
                    'lemma'] not in STOPWORD:
                words.append(chunck.attrib['lemma'])
        training.append(
            (words, list(set(itertools.chain.from_iterable(tags)))))
    return training
Example #20
def wordnet_process(filename):
        '''
        {'target_word': u'picture#n', 'target_sense': None, 'id': None, 'context': [u'<unk>', u'is', u'the', '<target>', u'of', u'the', u'<unk>', u'and', u'the', u'<unk>'], 'poss': ['.', '.', '.', '.', '.', '.', '.', '.', '.', '.']}
        '''
        dict_list = []
        with open(filename) as f1:
            for i, line in enumerate(f1):
                wordlist = line.strip().split()
                dict = {}
                target_id = -1
                target = ''
                for id, word in enumerate(wordlist):
                    if '%' in word:
                        name = wn.lemma_from_key(word).synset().name().split('.')
                        target_id = id
                        # target = name[0]+'#'+name[1]
                        target = word[:word.index('%')]+'#'+name[1]
                        break
                dict['target_word'] = target
                dict['target_sense'] = None
                dict['id'] = None
                dict['context'] = []
                for id, word in enumerate(wordlist):
                    if id == target_id:
                        dict['context'].append('<target>')
                    else:
                        dict['context'].append(word)
                dict['poss'] = ['.' for i in range(0, len(wordlist))]
                dict_list.append(dict)
        with open(filename+'.pickle', 'wb') as fp:
            pickle.dump(dict_list, fp)
Example #21
def sk_to_ss():
    """Update sensekey in tag file to synsetID (offset-pos)"""
    all_sk = set()
    print("Reading tag file ...")
    with open(SEMCOR_TAG, 'r') as semcor_tag:
        lines = [ x.split() for x in semcor_tag.readlines() ]
    for line in lines:
        sk = line[3]
        scloc = sk.find(';')
        if scloc > -1:
            sk = sk[:scloc] # only consider the first sensekey
        all_sk.add(sk)
    print(len(all_sk))

    print("Loading WordNet ...")
    from nltk.corpus import wordnet as wn
    all_sk_notfound = set()
    with open(SS_SK_MAP, 'w') as mapfile:
        for sk in all_sk:
            try:
                if sk not in all_sk_notfound:
                    ss = wn.lemma_from_key(sk).synset()
                    sid = '%s-%s' % (ss.offset(), ss.pos())
                    mapfile.write('%s\t%s\n' % (sk, sid))
            except nltk.corpus.reader.wordnet.WordNetError:
                all_sk_notfound.add(sk)
            except ValueError:
                print("Invalid sk: %s" % (sk,))
                all_sk_notfound.add('[INVALID]\t' + sk)
    with open(SK_NOTFOUND, 'w') as notfoundfile:
        for sk in all_sk_notfound:
            notfoundfile.write(sk)
            notfoundfile.write('\n')
    print("Map file has been created")
Example #22
def sc2ss(sensekey):
    '''Look up a synset given the information from SemCor'''
    ### Assuming it is the same WN version (e.g. 3.0)
    try:
        return wn.lemma_from_key(sensekey).synset()
    except:
        pass
Example #23
def evaluate(instance, sentence, golds):
    preds = predict(instance, sentence)
    res = []
    for p, g in zip(preds, golds):
        g = [wordnet.lemma_from_key(i).synset() for i in g[1]]
        res.append(p in g)
    return res
Example #24
 def lemmas(self):
     '''Return lemmas tied to the synsets'''
     if self.__BNLANG:
         return [s.getLemmas(self.__BNLANG) for s in self.synsets()]
     else:
         from nltk.corpus import wordnet as wn
         print(self.nearest)
         return [wn.lemma_from_key(k) for k, d in self.nearest]
Example #25
def lemma_to_synset_key(keyin, keyout):
    for line in keyin:
        inst_id, lemma_ids = line.split(" ", 1)
        keyout.write(inst_id)
        for lemma_id in lemma_ids.split():
            keyout.write(
                " " + wordnet.ss2of(wordnet.lemma_from_key(lemma_id).synset()))
        keyout.write("\n")
Example #26
 def get_tokens(self, in_file):
     root = ET.parse(in_file).getroot()
     for i, s in enumerate(root.findall('document/paragraph/sentence')):
         for t in s:
             synset_labels, lexname_labels = [], []
             if 'wn30_key' in t.attrib:
                 sensekey = t.attrib['wn30_key']
                 try:
                     synset = wn.lemma_from_key(sensekey).synset()
                 except Exception as e:
                     sensekey = sensekey.replace(
                         '%3', '%5')  # a fix for unprocessable satellites
                     synset = wn.lemma_from_key(sensekey).synset(
                     )  # now, we should be able to find the modified sensekey in WN
                 synset_labels.append(synset.name())
                 lexname_labels.append(synset.lexname())
             yield synset_labels, lexname_labels, t.attrib['surface_form']
Example #27
def create_dictionary(dataset_name, gold2dic, train=False):
    words = []
    wordnet = []

    # get an iterable
    context = etree.iterparse(dataset_name)

    sentence = []
    sentenceNet = []
    deletedSentences = 0
    dictionary = {}

    for event, elem in iter(context):

        if elem.tag == "sentence":

            #if(int(elem.attrib['id'])%10 == 0)
            print('\t' + elem.attrib['id'])

            if len(sentence) < 1:
                deletedSentences += 1
            else:
                words.append(' '.join(sentence))
                wordnet.append(' '.join(sentenceNet))
            sentence = []
            sentenceNet = []

        elif elem.tag == "wf" or elem.tag == "instance":
            lemma = elem.attrib["lemma"].lower()
            sentence.append(lemma)

            if elem.tag == "instance":
                dataset_id = elem.attrib["id"]
                synset = wn.lemma_from_key(gold2dic[dataset_id]).synset()
                synset_id = "wn:" + str(
                    synset.offset()).zfill(8) + synset.pos()
                sentenceNet.append(synset_id)
                if lemma not in dictionary:
                    dictionary[lemma] = [synset_id]
                elif synset_id not in dictionary[lemma]:
                    dictionary[lemma].append(synset_id)
            else:
                sentenceNet.append(lemma)

        elem.clear()

    if train:
        save(dictionary, '../resources/' + 'synsetsdic')
        flag = 'train'
    else:
        flag = 'dev'

    save(words, '../resources/' + 'words_' + flag)
    save(wordnet, '../resources/' + 'wordnet_' + flag)

    print('\nSentences removed:', deletedSentences)

    return words, wordnet
Example #28
def build_vocab_decoder_SemCor(threshold):

    # Create a vocab wrapper and add some special tokens.
    counter = Counter()
    target_file = open(
        "../../WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt",
        "r")

    # iterate through all definitions in the SemCor
    for line in target_file:

        # synset and literal definition from the WN
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        definition = synset.definition()
        def_tokens = nltk.tokenize.word_tokenize(definition)
        counter.update(def_tokens)

    # add SemEval synsets
    semeval_file = open(
        "../../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt",
        "r")
    for line in semeval_file:
        key = line.replace('\n', '').split(' ')[-1]
        synset = wn.lemma_from_key(key).synset()
        definition = synset.definition()
        def_tokens = nltk.tokenize.word_tokenize(definition)
        counter.update(def_tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)

    print("Total vocabulary size: {}".format(vocab.idx))
    return vocab
Example #29
def return_synset_id(key):
    '''
    Method used to return the wordnet synset_id from a sense_key of a specific word given in input.
    :param key the input sense_key
    :return the synset_id of the sense_key
    '''
    synset = wn.lemma_from_key(key).synset()
    synset_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
    return synset_id
Example #30
def wn_id_from_sense_key(sense_key):
    """
    Returns a WordNet ID built out of offset and POS from a given WordNet sense key.
    :param sense_key: WordNet sense key
    :return: WordNet ID corresponding to the given sense key
    """

    synset = wn.lemma_from_key(sense_key).synset()
    return wn_id_from_synset(synset)
Example #31
def semcor_to_offset(sensekey):
    """
    Converts SemCor sensekey IDs to synset offset.
    >>> print semcor_to_offset('live%2:42:06::')
    02614387-v
    """
    synset = wn.lemma_from_key(sensekey).synset
    offset = '%08d-%s' % (synset.offset, synset.pos)
    return offset
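This example (and its copies further down) targets the old NLTK 2.x API, where synset, offset and pos were attributes. Under NLTK 3.x the same conversion would look like this sketch:

from nltk.corpus import wordnet as wn

def semcor_to_offset_nltk3(sensekey):
    # NLTK 3.x: synset(), offset() and pos() are methods
    synset = wn.lemma_from_key(sensekey).synset()
    return '%08d-%s' % (synset.offset(), synset.pos())

print(semcor_to_offset_nltk3('live%2:42:06::'))  # 02614387-v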
Example #32
def readAnswers(ifile="corpora/answers+misc/tasks/english-all-words/key"):
    answers = {}
    pattern = re.compile("(?P<id>d\S*)\s+(?P<sense>\S+:\S+)")
    for i in pattern.finditer(open(ifile).read()):
        try:
            answers[i.group("id")] = wn.lemma_from_key(i.group("sense"))
        except:
            pass
    return answers
Example #33
    def sk2lemma(self, sk, use_ws=False):
        try:
            lemma_name = wn.lemma_from_key(sk).name()
        except:
            lemma_name = sk.split('%')[0]

        if use_ws:
            lemma_name = lemma_name.replace('_', ' ')
        return lemma_name
Example #34
def semcor_to_offset(sensekey):
    """
    Converts SemCor sensekey IDs to synset offset.
    >>> print semcor_to_offset('live%2:42:06::')
    02614387-v
    """
    synset = wn.lemma_from_key(sensekey).synset
    offset = '%08d-%s' % (synset.offset, synset.pos)
    return offset
Example #35
def get_rel_lemmas(lemma_str):
    ret_lemmas = []
    syn = wn.lemma_from_key(lemma_str).synset
    ret_lemmas = [i.name.lower() for i in syn.lemmas]
    hyns = set(get_all_hypernyms(syn,3))
    for h in hyns:
        ret_lemmas += [i.name.lower() for i in h.lemmas]
        ret_lemmas += [i.lower() for i in lemmatize(nltk.word_tokenize(h.definition))]
    return ret_lemmas
Example #36
def get_sense(word):
    try:
        synset = wn.lemma_from_key(word).synset().lemma_names()
    except Exception:
        return word
    for w in synset:
        if w != word:
            return w
    return word
Example #37
def semcor_to_synset(sensekey):
    """
    Look up a synset given the information from SemCor sensekey format.
    (Thanks to @FBond, see http://moin.delph-in.net/SemCor)
    >>> ss = semcor_to_synset('live%2:42:06::')
    >>> print '%08d-%s' % (ss.offset, ss.pos)
    >>> print ss, ss.definition
    02614387-v
    Synset('live.v.02') lead a certain kind of life; live in a certain style
    """
    return wn.lemma_from_key(sensekey).synset
Example #38
 def get_mfs(self, lemma, sensekeys = None):
     senseid = None
     try:
         if(not sensekeys):
             sensekeys = self.get_sense_keys(lemma)
         sense_freqs = {}
         for i in sensekeys:
             lemma_obj = wn.lemma_from_key(i)
             sense_freqs[i] = lemma_obj.count()
             
         senseid = max(sense_freqs.iteritems(), key=operator.itemgetter(1))[0]
     except(ValueError):
         print "Error: No senses for %s" % (lemma)
     return senseid
Example #39
 def semcor2synset(self, sense):
     return wn.lemma_from_key(sense).synset()
Example #40
def lemma_from_key(key):
	return wordnet.lemma_from_key(key)
Example #41
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError


with open("EnglishAW.test.key") as answer_file:
    answers = answer_file.read().split('\n')[:-1]
for answer in answers:
    answer = answer.split()
    word_id = answer[1]
    lemmas = answer[2:]
    for lemma in lemmas:
        try:
            if lemma != "U":
                synset = wn.lemma_from_key(lemma).synset
        except WordNetError:
            print("word id: {}      lemma: {}".format(word_id, lemma))
Example #42
def semcor_to_offset(sensekey):
 
    synset = wn.lemma_from_key(sensekey).synset
    offset = '%08d-%s' % (synset.offset, synset.pos)
    return offset
Example #43
    def slesk(self, wordforms):
        sensepreds = {}
        sw_hash = {i:1 for i in stopwords.words('english')}

        open_wfs = [i for i in wordforms if i.tag != 'punc' and not ('cmd' in i.attrib and i.attrib['cmd'] == "ignore")] # remove stop words and punctuation first
#        open_wfs = [i for i in wordforms if i.tag != 'punc'] # remove only punctuation

        for wf in wordforms:
            if('cmd' in wf.attrib and wf.attrib['cmd'] == 'done'):
                lemma = wf.attrib['lemma'] # may be need to automatically lemmatize here
                
                sensekeys = self.get_sense_keys(lemma)
                if(len(sensekeys) == 0):
                    print "Error: No senses for %s" % lemma
                    continue
                synsets = {k: wn.lemma_from_key(k).synset for k in sensekeys}

                idfs = text_utils.get_idfs()

                # get the window now
                window = 2
                idx = [i for i,x in enumerate(open_wfs) if x == wf][0]
                lbound = idx-window if idx-window > 0 else 0 
                ubound = idx+window if idx+window < len(open_wfs) else len(open_wfs)-1

#                all_context = set(text_utils.lemmatize(([i.text.lower() for i in open_wfs[lbound:(ubound+1)] if ('cmd' not in i.attrib or (i.attrib['cmd'] != "ignore" and i.attrib['id'] != wf.attrib['id']))]))) # this one keeps stopwords in window count
                all_context = set(text_utils.lemmatize(([i.text.lower() for i in open_wfs[lbound:(ubound+1)] if (i.attrib['id'] != wf.attrib['id'])]))) # this one keeps stopwords in window count


                jc_th = 0.1
                context = [i for i in all_context if text_utils.compute_jc_sim(i,lemma) > jc_th] # lexical chain selection algorithm

                outstr = "-------------------"
                outstr += "\ncontext: "+str(context)

#                best = self.get_mfs(lemma)
                max = 0
                cands = []

                for k in synsets.keys():
                    synset = synsets[k]
                    wntext = text_utils.lemmatize(nltk.word_tokenize(synset.definition))
                    for ex in synset.examples:
                        wntext += text_utils.lemmatize(nltk.word_tokenize(ex))
                        
#                    wntext += text_utils.lemmatize(text_utils.get_rel_lemmas(k)) # related lemmas from hypernyms etc.

                    wntext = [i.lower() for i in wntext]
                    lenlog = math.log(len(wntext))
                    normalizer = 1/ lenlog if lenlog > 0 else 0
                    outstr += "\n"+k+":"+str(wntext)
                    wn_hash = {i:1 for i in wntext}

                    matches = {}
                    score = 0
                    for i in context:
                        if(i in sw_hash): continue
                        if i in wntext:
                            score += 1
                            # if i in idfs:
                            #     score += idfs[i]
                            # else:
                            #     score += 3
                            matches[i] = 1 #idfs[i]
                    outstr += "\nScore: %s:%f" % (matches,score)
#                    score = score * normalizer
                    outstr += "\nNorm score: %s:%f" % (matches,score)
                    if score > max:
                        cands = [k]
                        max = score
                    elif score == max:
                        cands.append(k)

                if(len(cands) > 1):
                    best = self.get_mfs(lemma, cands)
                else:
                    best = cands[0]
            

                mfs_id = self.get_mfs(lemma)
                true_id = lemma+"%"+wf.attrib['lexsn']
                if mfs_id == true_id and best != mfs_id:
                    print "stat:leskbad"
                    print outstr
                    print "MFS: %s, LESK: %s, CORRECT: %s" % (self.get_mfs(lemma), best, wf.attrib['lexsn'])
                elif mfs_id != true_id and best == true_id:
                    print "stat:leskgood"
                    print outstr
                    print "MFS: %s, LESK: %s, CORRECT: %s" % (self.get_mfs(lemma), best, wf.attrib['lexsn'])
                elif max == 0:
                    print "stat:nolesk"
                else:
                    print "stat:lesksame"

                if(best):
                    m = re.match("^"+lemma+"\%(\d+:\d+:\d+:(.*))", best)
                    sensepreds[wf.attrib['id']] = m.group(1)
                else:
                    sensepreds[wf.attrib['id']] = "gibberish"
                
        return sensepreds
Example #44
def semcor_to_synset(sensekey):
 
    return wn.lemma_from_key(sensekey).synset
Example #45
def getSSFromLemma(l):
    return str(wordnet.lemma_from_key(l).synset().name())