Example #1
def normalize_word(word, words=None, stemmer=None, lemmatizer=None):
    '''Normalize a word to its lemma/stem form;
    if the result is not a dictionary word, return ''.
    '''
    # initialize once
    global WORDS, STEMMER, LEMMATIZER
    if words is None:
        if WORDS is None:
            # wordnet.words() returns an iterator; cache it as a set so
            # repeated membership tests keep working
            WORDS = set(wordnet.words())
        words = WORDS
    if stemmer is None:
        if STEMMER is None:
            STEMMER = LancasterStemmer()
        stemmer = STEMMER
    if lemmatizer is None:
        if LEMMATIZER is None:
            LEMMATIZER = WordNetLemmatizer()
        lemmatizer = LEMMATIZER

    if word in words:
        return word
    temp = lemmatizer.lemmatize(word)
    if temp in words:
        return temp
    temp = lemmatizer.lemmatize(word, pos='v')
    if temp in words:
        return temp
    temp = stemmer.stem(word)
    if temp in words:
        return temp
    return ''
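A quick usage sketch for this helper (not part of the original example): it assumes the imports it needs and the module-level caches the function declares as globals, initialised to None.

# Minimal usage sketch, assuming the module-level caches normalize_word() declares
from nltk.corpus import wordnet
from nltk.stem import LancasterStemmer, WordNetLemmatizer

WORDS = None
STEMMER = None
LEMMATIZER = None

print(normalize_word('cats'))    # typically 'cat' via the lemmatizer
print(normalize_word('xqzzt'))   # not a dictionary word -> ''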
Example #2
def setup():
    try:
        all_words = wn.words()
        print("have wordnet, all is well")
    except LookupError:
        print("don't have wordnet, downloading")
        nltk.download('wordnet')
        all_words = wn.words()
    global WORDS
    WORDS = [w for w in all_words if w[0].isalpha() and "_" not in w]
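A minimal sketch of the module context this setup() assumes: nltk and the wn alias imported, and a WORDS global that it can overwrite.

# Sketch of the assumed module context for setup(); WORDS is the global it fills
import nltk
from nltk.corpus import wordnet as wn

WORDS = []

setup()
print(len(WORDS), "alphabetic, underscore-free words loaded")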
Example #3
 def lemmatize_if_needed(self, t):
     if t in wn.words() and wn.synsets(t)[0].pos() == 'n':
         logging.debug('found in wn : {}'.format(t))
         return t
     if t[-3:] == 'ies':
         t = t[:-3] + 'y'
     if t[-1:] == 's':
         t = t[:-1]
     if t in wn.words() and wn.synsets(t)[0].pos() == 'n':
         logging.debug('found in wn : {}'.format(t))
         return t
     return None
Example #4
def exercise4():
    # The majority of WordNet's senses are marked by four POS categories: noun, verb, adjective, and adverb.
    # Determine the percentage of words from the WordNet corpus that have senses in more than one of these categories.
    # For example, type has senses which connect to both "noun" and "verb" POS (positive case),
    # whereas typewriter has only senses which connect to "noun" POS (negative case)

    # get all words in WordNet
    wn_words = [w for w in wn.words()]  #list of words
    wn_text = nltk.Text(word.lower() for word in wn_words)  # plain text
    #print wn_text
    pos_tags = nltk.pos_tag(wn_text)  # tag of word (word, tag)

    # convert to WordNet's senses
    wn_pos_tags = []
    for pos_tag in pos_tags:
        if pos_tag[1].startswith('J'):
            wn_pos_tags.append((pos_tag[0], 'ADJ'))
        if pos_tag[1].startswith('V'):
            wn_pos_tags.append((pos_tag[0], 'VERB'))
        if pos_tag[1].startswith('N'):
            wn_pos_tags.append((pos_tag[0], 'NOUN'))
        if pos_tag[1].startswith('R'):
            wn_pos_tags.append((pos_tag[0], 'ADV'))

    count = 0
    data = nltk.ConditionalFreqDist((word, tag) for (word, tag) in wn_pos_tags)
    for word in data.conditions():
        if len(data[word]) > 1:  # tagged with more than one POS category
            count = count + 1
            tags = data[word].keys()
            print(word, ' '.join(tags))

    print("The percentage of words that have senses in more than one category:",
          count / len(data.conditions()))
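The pos_tag call above guesses a tag for each lemma in isolation; WordNet itself records which POS categories a lemma has senses in. A sketch of a WordNet-native variant (an alternative, not the author's approach) follows.

# Alternative sketch: ask WordNet directly which POS categories each lemma covers
from nltk.corpus import wordnet as wn

def exercise4_wordnet_native():
    multi = 0
    total = 0
    for word in wn.words():
        pos_set = {s.pos() for s in wn.synsets(word)}
        # treat satellite adjectives ('s') as adjectives ('a')
        pos_set = {'a' if p == 's' else p for p in pos_set}
        total += 1
        if len(pos_set) > 1:
            multi += 1
    print("Percentage with senses in more than one category:", multi / total)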
Example #5
def main(args):

    target = wn.synset(args.target)
    print('target:', args.target)

    words = wn.words()

    nouns = set([])
    for word in words:
        nouns.update(wn.synsets(word, pos='n'))

    print(len(nouns), 'nouns')

    hypernyms = []
    for noun in nouns:
        paths = noun.hypernym_paths()
        for path in paths:
            try:
                pos = path.index(target)
                for i in range(pos, len(path) - 1):
                    hypernyms.append((noun, path[i]))
            except ValueError:
                # target is not on this hypernym path
                continue

    hypernyms = list(set(hypernyms))
    print(len(hypernyms), 'hypernyms')

    if not args.shuffle:
        random.shuffle(hypernyms)
    with open(args.result_file, 'w') as fout:
        for n1, n2 in hypernyms:
            print(n1.name(), n2.name(), sep=args.sep, file=fout)
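This main() only reads args.target, args.result_file, args.shuffle and args.sep; a possible argparse entry point is sketched below. The flag names and defaults are assumptions, not part of the original example.

# Possible command-line entry point for main(); flag names are assumed
import argparse
import random
from nltk.corpus import wordnet as wn

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', default='mammal.n.01')
    parser.add_argument('--result_file', default='hypernyms.tsv')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--sep', default='\t')
    main(parser.parse_args())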
Example #6
def checkgibberish_words(userInput):
    '''
    Check whether every word of the input sentence is a valid English word.
    '''
    #Remove punctuation

    #make translator object
    translator=str.maketrans('','',string.punctuation)
    userInput=userInput.translate(translator)

    #Remove digits from the input
    userInput_NoDigits = re.sub(r'\d+', '', userInput)
    wnl = WordNetLemmatizer()

    gibberishWord = False
    for word in userInput_NoDigits.split():
        if word not in words.words():
            if word not in wordnet.words():
                # the words and WordNet lists do not include plural forms,
                # so check for a plural via the WordNetLemmatizer
                lemma = wnl.lemmatize(word, 'n')
                plural = word != lemma
                if not plural:
                    gibberishWord = True

    return gibberishWord
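For reference, a small usage sketch with the imports the function body relies on; it assumes the NLTK words and wordnet corpora are already downloaded, and the expected outputs are indicative rather than guaranteed.

# Usage sketch for checkgibberish_words()
import re
import string
from nltk.corpus import words, wordnet
from nltk.stem import WordNetLemmatizer

print(checkgibberish_words("the cats sat on the mat!"))   # expected False
print(checkgibberish_words("xqzt blorf on the mat 42"))   # expected True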
Example #7
def getWords():
    letters = textSearch.get()
    sLen = searchLen.get()
    if sLen:
        sLen = int(sLen)
    else:
        sLen = 0

    listWords = []

    compRegx = re.compile('^[{}]+$'.format(letters))

    for w in wordnet.words():
        match = compRegx.search(w)
        if match and compRepeat(letters, w):
            if sLen == 0:
                listWords.append(match.group())
            elif len(match.group()) == sLen:
                listWords.append(match.group())

    listBox1.delete(0, listBox1.size())

    i = 1
    for w in listWords:
        listBox1.insert(i, "->  {}".format(w))
        i += 1
    listBox1.pack()
Example #8
def build_word_data():
    print('***** Build words.\n')
    words = list(wn.words())
    max_length_word = get_longest_item(words)
    dump_data_to_pickle(WORDS_PATHS['pkl'], words, max_length_word)
Example #9
def gen_signature(word):
    """Generate a signature for each candidate expansion, using contextual
       information from the Brown corpus, as well as WordNet definitions and
       examples (if applicable)."""
    if word in gen_signature.dict:
        return gen_signature.dict[word]
    inds = find_matches(word)
    if len(inds) > 50:
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1
    sig = {w for w in signature
           if signature[w] > 1
           and w not in stopwords.words('english') and w != ','}
    if word in wn.words():
        synsets = wn.synsets(word)
        if synsets:
            # use the first sense's definition and examples directly
            define = synsets[0].definition()
            examples = synsets[0].examples()
            if examples:
                for ex in examples:
                    sig.update([w for w in wt(ex)
                                if w not in stopwords.words('english')])
            if define:
                sig.update([w for w in wt(define)
                            if w not in stopwords.words('english')])
    gen_signature.dict[word] = sig
    return sig
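gen_signature() memoizes on a function attribute, so gen_signature.dict must exist before the first call. A minimal setup sketch follows; it assumes wt is nltk.word_tokenize and that find_matches() and gen_context() come from the original module (they are not reproduced here).

# Setup sketch for gen_signature(); helper functions are assumed to exist elsewhere
from collections import defaultdict
from nltk.corpus import brown, stopwords, wordnet as wn
from nltk import word_tokenize as wt

gen_signature.dict = {}   # memoization cache must exist before the first call
# signature = gen_signature('doc')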
Example #10
 def get_words(self, n=500):
     '''
     Get all the words from WordNet and return them as a list.
     '''
     # a set is used to make sure the words are unique; note that the first n
     # items of an unordered set are not deterministic
     return list(set(wn.words()))[:n]
Example #11
def generate_network(word, network=defaultdict(set)):
    print("building network for word '%s' subtree..." % word)
    logger.info("building network for word '%s' subtree..." % word)
    words, target = wn.words(), wn.synset('%s.n.01' % word)
    targets = set(open('data/%s_dependencies.txt' % word).read().split('\n'))
    nouns = {
        noun
        for word in words for noun in wn.synsets(word, pos='n')
        if noun.name() in targets
    }
    for noun in nouns:
        for path in noun.hypernym_paths():
            if target not in path:
                continue
            for i in range(path.index(target), len(path) - 1):
                if path[i].name() not in targets:
                    continue
                network[noun.name()].add(path[i].name())
    with open('data/%s_network.csv' % word, 'w') as out:
        nb_vertex = len(network)
        for key, vals in network.items():
            for val in vals:
                out.write(key.split('.')[0] + ',' + val.split('.')[0] + '\n')
    nb_links = len(pd.read_csv('data/%s_network.csv' % word))
    print('Built network of %s vertexes and %s links for word %s' %
          (nb_vertex, nb_links, word))
Example #12
def analyse_definition():
    from nltk.corpus import wordnet as wn

    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    all_words = set(wn.words())
    all_defs = set()

    with progressbar.ProgressBar(max_value=len(all_words)) as bar:
        for idx, word in enumerate(all_words):
            synsets = wn.synsets(word)
            definitions = ' '.join([_.definition() for _ in synsets])
            definitions = definitions.replace(";", "")
            defwords = [
                defw for defw in definitions.split(' ') if defw in all_words
            ]
            defwords += [
                lemmatizer.lemmatize(defw) for defw in definitions.split(' ')
                if defw not in all_words
                and lemmatizer.lemmatize(defw) in all_words
            ]
            all_defs.update(defwords)
            bar.update(idx)

    print("Full voc:", len(list(all_words)))
    print("Full def:", len(list(all_defs)))
Example #13
 def __init__(self):
     words = list(set(i for i in wn.words()))
     counter = Counter()
     self.max_len = 0
     for word in words:
         counter.update([word])
         word = wn.synsets(word)
         for meaning in word:
             definition = re.sub(r'\([^)]*\)', '', meaning.definition())
             if len(definition) == 0:
                 continue
             if definition[0] == ' ':
                 definition = definition[1:]
             self.max_len = max(self.max_len, len(definition.split(' ')))
             counter.update(definition.split(' '))
     self.vocab = Vocab(counter,
                        specials=('<unk>', '<pad>', '<sos>', '<eos>'))
     self.vocab_len = len(self.vocab)
     self.meanings = []
     out_counter = Counter()
     for word in words:
         if counter[word] > 3:
             out_counter.update([word])
             self.meanings.extend([(word, i.definition())
                                   for i in wn.synsets(word)])
     self.out_vocab = Vocab(out_counter,
                            specials=('<unk>', '<pad>', '<sos>', '<eos>'))
     self.out_vocab_len = len(self.out_vocab)
Example #14
 def __init__(self):
     self.words = list(set(i for i in wn.words()))
     counter = Counter()
     for word in self.words:
         counter.update([word])
         word = wn.synsets(word)
         for meaning in word:
             counter.update(meaning.definition().split(' '))
     self.vocab = Vocab(counter)
Example #15
def get_n_length_words(n):
    words = list(wn.words())
    nLength = set()
    for word in words:
        i = count_underscore(word)
        if len(word) - i == n:
            word = word.replace('_', '')
            nLength.add(word.upper())
    return sorted(nLength)
Example #16
def is_english_word(word):
    """
    This function is used to set the the key AKA the mono alphabetic list
    Return : array of integers
    """
    setofnetwords = set(wordnet.words())
    if word in setofnetwords:
        return True
    else:
        return False
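Rebuilding set(wordnet.words()) on every call is expensive; a variant sketch that caches the vocabulary once at module level, with otherwise the same behaviour:

# Cached variant sketch; the module-level set is built once and reused
from nltk.corpus import wordnet

_WORDNET_WORDS = set(wordnet.words())

def is_english_word_cached(word):
    return word in _WORDNET_WORDS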
Example #17
 def eng(self):
     words = list(set(i for i in wn.words()))
     out_words = []
     for word in words:
         meanings = wn.synsets(word)
         word = word.replace('_', ' ')
         for meaning in meanings:
             out_words.append((word, meaning.definition()))
     indices = list(range(len(out_words)))
     return out_words, indices
Example #18
	def __init__(self):
		self.tag = random.sample(list(wordnet.words()),1)[0]
		self.intensity = random.random()
		self.sentiment = random.random()*2 - 1
		self.lifetimesecs = random.random() * (EvolvingRandomTag.MAX_LIFETIME-1) + 1
		self.ttl = self.lifetimesecs
		self.t_start = time.time()
		self.t_last = self.t_start
		self.intensitystep = EvolvingRandomTag.MAX_STEP_INTENSITY  # max step per second
		self.sentimentstep = EvolvingRandomTag.MAX_STEP_SENTIMENT
Example #19
    def construct_wn_thesauri(self, thesauri_dir):
        """
        1. T <--- extract all wordnet words
        2. for a word in T, find its wordnet synonyms and antonyms, save them in two sets
        3. for all words and their corresponding sets, save them in two dicts
        4. save two dicts as npy files
        if a word doesn't appear in the dict, it will return an empty set
        :param thesauri_dir: path to save syn_dict and ant_dict
        :return: None
        """

        # key: word, value: syn_words(set)
        syn_dict = defaultdict(set)
        # key: word, value: ant_words(set)
        ant_dict = defaultdict(set)

        # get synonyms from wordnet
        # since a word has multiple meanings, we combine the synonyms of all
        # its senses as its synonyms, and all their antonyms as its antonyms
        # TODO: make this behaviour configurable in the future
        for word in tqdm(wn.words()):
            synonyms, antonyms = [], []
            for syn in wn.synsets(word):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())
                    antonyms.append(
                        [ant_lemma.name() for ant_lemma in lemma.antonyms()])
            # add word's synonyms
            syn_dict[word].update(synonyms)
            # remove itself in syn set
            if word in syn_dict[word]:
                syn_dict[word].remove(word)
            # add word's antonyms
            ant_dict[word].update(chain.from_iterable(antonyms))
            # remove itself in ant set
            if word in ant_dict[word]:
                ant_dict[word].remove(word)

            # since a word t may have multiple meanings, the same word w could be
            # both its synonym and antonym; in that case, remove w from t's synonyms
            syn_dict[word] = syn_dict[word] - ant_dict[word]

            # CAUTION: original wordnet is asymmetric on synonyms and antonyms
            # make syn_dict symmetric
            for synonym in syn_dict[word]:
                syn_dict[synonym].add(word)
            # make ant_dict symmetric
            for antonym in ant_dict[word]:
                ant_dict[antonym].add(word)

        # save synonym/antonym dict
        np.save(join(thesauri_dir, 'syn_dict.npy'), syn_dict)
        np.save(join(thesauri_dir, 'ant_dict.npy'), ant_dict)
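Since np.save pickles the two dicts, reading them back needs allow_pickle=True and .item(). A small loading sketch; the directory name is an example, not from the original.

# Loading sketch for the saved thesauri
import numpy as np
from os.path import join

thesauri_dir = 'thesauri'   # example path
syn_dict = np.load(join(thesauri_dir, 'syn_dict.npy'), allow_pickle=True).item()
ant_dict = np.load(join(thesauri_dir, 'ant_dict.npy'), allow_pickle=True).item()
print(sorted(syn_dict.get('happy', set()))[:5])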
Example #20
def gen_data(network=defaultdict(set)):
    words, target = wn.words(), wn.synset('mammal.n.01')
    targets = set(open('data/targets.txt').read().split('\n'))
    nouns = {noun for word in words for noun in wn.synsets(word, pos='n') if noun.name() in targets}
    for noun in nouns:
        for path in noun.hypernym_paths():
            if target not in path:
                continue
            for i in range(path.index(target), len(path) - 1):
                if path[i].name() not in targets:
                    continue
                network[noun.name()].add(path[i].name())
    with open('data/mammal_subtree.tsv', 'w') as out:
        for key, vals in network.items():
            for val in vals:
                out.write(key + '\t' + val + '\n')
Example #21
 def crawl_wordnet(self, corenlp_url):
     corenlp = StanfordCoreNLP(corenlp_url)
     for i, word in enumerate(wordnet.words()):
         if word in self._data:
             logger.info('skip a known word {}'.format(word))
             continue
         self._data[word] = []
         for synset in wordnet.synsets(word):
             def_ = corenlp.tokenize(synset.definition())[0]
             self._data[word].append(def_)
         if i % 10000 == 0:
             self.save()
     self.save()
Example #22
 def structure_to_sdr(self, synset_structure_with_idx):
     con = sqlite3.connect("data/words_sdr.db")
     con.execute("CREATE TABLE sdrtable (word, sdr)")
     sql = ''' INSERT INTO sdrtable(word, sdr)
               VALUES(?,?) '''
     cursor = con.cursor()
     for word in wn.words():
         sdr = ' '.join(
             str(idx)
             for idx in self.word_to_sdr(word, synset_structure_with_idx))
         cursor.execute(sql, (word, sdr))
     con.commit()
     con.close()
Example #23
def get_prepared_words() -> List[str]:
    # init
    nltk.download('words')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # prepare
    prepared_words = []
    r = requests.get(
        'https://raw.githubusercontent.com/dwyl/english-words/master/words_dictionary.json'
    )
    dwyl_words = list(r.json().keys())
    ahmadly_words = list(
        pickle.load(
            urlopen(
                "https://github.com/jojoee/WordCookiesCheat/blob/master/all_words.pickle?raw=true"
            )))
    wordnet_words = list(wordnet.words())

    # proceed
    prepared_words = stopwords.words() + words.words(
    ) + wordnet_words + ahmadly_words
    prepared_words = [
        word.lower() for word in prepared_words if word.isalpha()
    ]
    prepared_words = list(set(prepared_words))

    # sort
    prepared_words.sort()

    # debug
    print("stopwords size: %d" % len(stopwords.words()))
    print("words size: %d" % len(words.words()))
    print("wordnet size: %d" % len(list(wordnet.words())))
    print("dwyl_words size: %d" % len(dwyl_words))
    print("prepared_words size: %d" % len(prepared_words))

    return prepared_words
Example #24
 def _scores_over_sample(self):
     # random.sample no longer accepts a set (Python 3.11+), so sample from a list
     words = list(set(wn.words()))
     sample = set(random.sample(words, self.sample_size))
     for word1 in sample:
         for word2 in (sample - set([word1])):
             self.compute_overlap(word1, word2)
             self.path_similarity(word1, word2)
             # self.lch_similarity(word1, word2)
             self.wup_similarity(word1, word2)
             # self.res_similarity(word1, word2)
             # self.jcn_similarity(word1, word2)
             # self.lin_similarity(word1, word2)
             print(f"{word1} and {word2} done")
         sample = sample - set([word1])
Example #25
    def __init__(self, in_terms=[], threshold=0.025, allow_multiple=False, rseed=1, num_rands=10):
        self.threshold = threshold

        self.synset_counts = {}
        self.synset_set = set()
        self.preceding_words = set([''])
        self.succeeding_words = set([''])
        self.matched_terms = set()
        self.skip_set = set()
        self.max_length = 0
        self.allow_multiple = allow_multiple

        if rseed > 0:
            random.seed(rseed)

        words = []
        for i in range(num_rands):
            word = random.sample(list(wn.words()), 1)[0]
            while wn.synsets(word)[0].pos() != 'n':
                word = random.sample(list(wn.words()), 1)[0]
            print('word {}'.format(word))
            words += [word]

        self.get_synsets_recursively(wn.synsets(words[0]))
        self.overfrequent_synsets = self.skip_set.copy()
        print('overfrequent synsets : {}'.format(self.overfrequent_synsets))
        for word in words[1:]:
            self.skip_set = set()
            self.get_synsets_recursively(wn.synsets(word))
            print('skip_set synsets : {}'.format(self.skip_set))
            self.overfrequent_synsets &= self.skip_set
            print('overfrequent synsets : {}'.format(self.overfrequent_synsets))
        
        for t in in_terms:
            if not self.add_training_term(t):
                logging.debug('no wn vocab found :: {}'.format(t))
        self.make_set()
Example #26
def findWordNetRhymes(word):
    w = d(word.lower())[0]

    res = []
    for ss in wn.words():
        if d(ss)[0] == w:
            res.append(ss)

    if len(res) < 1:
        word = word[1:]
        if len(word) > 1:
            print("Going one level further")
            return findWordNetRhymes(word)
    else:
        return res
Example #27
def main(result_file, shuffle, sep):

    words = wn.words()
    nouns = set([])
    for word in words:
        nouns.update(wn.synsets(word, pos='n'))

    print(len(nouns), 'nouns')

    hypernyms = list(transitive_closure(nouns))
    print(len(hypernyms), 'hypernyms')
    if not shuffle:
        random.shuffle(hypernyms)
    with open(result_file, 'w') as fout:
        for n1, n2 in hypernyms:
            print(n1.name(), n2.name(), sep=sep, file=fout)
Example #28
    def __init__(self):
        self.nlp = English()
        self.model = Word2Vec.load_word2vec_format(self.model_file,
                                                   binary=True)

        self.new_words.extend(list(self.extra_words.values()))
        self.sw = set(stopwords.words('english'))
        self.sw.update(webcolors.CSS3_NAMES_TO_HEX)  # remove colors
        self.sw.update(self.lorem)
        self.sw.update(self.extra_stopwords)
        for w in self.non_stopwords:
            self.sw.discard(w)
        self.english_vocab = set(w.lower() for w in words.words('en'))
        self.english_vocab.update(wordnet.words('eng'))
        self.english_vocab.update(self.new_words)
        self.word_set = set(self.model.index2word)
        self.num_features = 300
Example #29
def create_Wordnet_set():
    '''The purpose of this function is to create a set of all words from the wordnet dictionary.
    Input  = None
    Output = Set object of all words. 
    '''
    # Import words from wordnet
    from nltk.corpus import wordnet as wn
    Words = wn.words()

    # Create Set of all words
    Set_dict_words = set(Words)

    # Return Set
    return Set_dict_words
Example #30
    def get_wordnet_subset(self):
        '''
        Get the subset of WordNet for a specific language.
        Using the fastText model, this function associates with each synonym
        a similarity score relative to the processed word.
        A dictionary is returned with each language word as key and the
        corresponding list of synonym dicts as value.
        Each dictionary in the list has the synonym itself as key and the similarity as value.
        :return:
        '''
        # Loading of the model
        self.emb_model = self.load_fastText_model()

        words = [word.replace('_', ' ') for word in wn.words(lang=self.LANG)]

        syns = {}
        for w in words:
            syns[w] = [
                synset.lemma_names(self.LANG)
                for synset in wn.synsets(w, lang=self.LANG)
            ]

        synonyms = {}

        for word in syns.keys():
            syn_list = {}
            for synset in syns[word]:
                for word_syn in synset:
                    if word.lower() == word_syn.lower():
                        continue

                    if self.word_in_syn_list(syn_list, word_syn):
                        continue

                    if math.isnan(
                            float(self.get_word_similarity(word, word_syn))):
                        continue

                    syn_list[word_syn.replace('_', ' ')] = float(
                        self.get_word_similarity(word, word_syn))

            synonyms[word] = syn_list

        return synonyms
Example #31
def train(tests):
    c_dir = "/home/such/Documents/NLP/Programming_Assignment_resources/"
    fpath = c_dir + 'big.txt'

    stops = stopwords.words('english')
    doc = open(fpath, 'r')
    words = re.findall('[a-z]+', doc.read().lower())

    # lowercase the Brown corpus and build an offset dictionary for concordancing
    corp = [w.lower() for w in brown.words()]
    # note: wn.words() takes a language code, not a word, so membership is
    # checked against the full WordNet lemma set instead
    wn_vocab = set(wn.words())
    ispresent = lambda x: x in wn_vocab
    corp_dict = offset_dict(corp)

    for s in tests:
        print(s)
        test_words = re.findall('[a-z]+', s.lower())

        # find the possible misspelled words
        misspelled = []
        for t_word in test_words:
            if t_word.lower() not in words and t_word.lower() != '':
                misspelled.append(t_word.lower())

        # find the candidate corrections for each word in the misspelled list
        candidates = {}
        for wrong in misspelled:
            candidates[wrong] = list(edit_distances.correct(wrong))

        # find the context words for the test sentences and the corpus
        corrections = {}
        for miss in misspelled:
            print(test_words)
            # context words of the misspelled word in the test sentence
            error_dict = offset_dict(test_words)
            error_context = list(set(concord(error_dict, test_words, miss)))
            error_context = [e for e in error_context if e not in stops]
            errcont = []
            for errc in error_context:
                errcont += list(set(concord(corp_dict, corp, errc)))
            errcont = [e for e in errcont if ispresent(e) and e not in stops]
            errcont += error_context

            # for each context word, count how often it co-occurs with each candidate
            counts = {}
            for c in candidates[miss]:
                cand_cooccur = list(set(concord(corp_dict, corp, c)))
                cand_cooccur = [ca for ca in cand_cooccur
                                if ispresent(ca) and ca not in stops]
                count = sum(cand_cooccur.count(i) for i in errcont)
                counts[c] = count, sim(errcont, c)

            print(counts)

            # suggest the correction with the highest score
            corrections[miss] = max(counts, key=lambda a: counts.get(a))
            p = test_words.index(miss)
            test_words[p] = corrections[miss]

            print("misspelled :" + miss + "\n")
            try:
                print("correction :" + corrections[miss] + "\n\n")
            except ValueError:
                pass
Example #32
def train(words):
    tagging_model = dict()
    training_model = dict()
    for word, tag in words:
        try:
            tagging_model[word].add(tag)
        except KeyError:
            tagging_model[word] = set([tag])
        try:
            training_model[word] += 1
        except KeyError:
            training_model[word] = 1
    return training_model, tagging_model

known_good_words = list(wordnet.words())
known_words, known_tagged_words = train(brown.tagged_words())

# the candidate alphabet is not defined in the original snippet; lowercase ASCII is assumed
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edit_distance_one(word):
    set_of_words = set()
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    inserts    = set([a + c + b for a, b in splits for c in alphabet])
    set_of_words = set_of_words.union(inserts)

    deletes    = set([a + b[1:] for a, b in splits if b])
    set_of_words = set_of_words.union(deletes)

    trans      = set([a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1])
    set_of_words = set_of_words.union(trans)
    return set_of_words
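A quick usage sketch for edit_distance_one(): intersect the one-edit candidates of a misspelling with the WordNet word list built above.

# Usage sketch: candidate corrections for a misspelling, filtered by WordNet
candidates = edit_distance_one('teh') & set(known_good_words)
print(sorted(candidates))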
Example #33
                        matrix.append([net_words.index(str(word)), net_words.index(ant_word), -1])
                        matrix.append([net_words.index(ant_word), net_words.index(str(word)), -1])

if __name__ == "__main__":
    
    result_word = open('/Users/Jane/Documents/Python/result_word.txt', 'w')
    result_matrix = open('/Users/Jane/Documents/Python/result_matrix.txt', 'w')
    net_words = []
    synsets = []
    antonyms = []
    counter = 0
    matrix = []
    word_list = list(wn.words())
    random.shuffle(word_list)
    for word in word_list:
        if counter < 25000:
            get_synsets(word, synsets, net_words)
            counter += 1
    for item in net_words:
        result_word.write(item)
        result_word.write('\n')
    result_word.close()
    for item in matrix:
        result_matrix.writelines(str(item))
        result_matrix.write('\n')
    result_matrix.close()
    print(len(net_words))