from pattern.nl import parse, singularize
# `dg` is the project's diminutive generator module, imported elsewhere.


def return_dim_sent(sent):
    """ Takes in a string sentence and checks if there are nouns in that sentence. If there are, it
    returns the sentence with the nouns in their diminutive form.

    :param sent: a string containing a sentence.
    :return: a string containing the sentence with the nouns turned into diminutives.
    :rtype: str
    """

    parsed = parse(sent, tokenize=True, tags=True, chunks=False)

    new_sent = []
    for word, pos in parsed.split()[0]:
        if pos == 'NN' and not word.endswith('je'):      # If the word is a singular noun...
            dim = dg.generate_diminutive(word)

            # Diminutives take the article 'het'; correct a preceding 'de'.
            # Not perfect, though. (Guard against an empty list on the first token.)
            if new_sent and new_sent[-1] == 'de':
                new_sent[-1] = 'het'

            new_sent.append(dim)
        elif pos == 'NNS' and not word.endswith('jes'):  # If the word is a plural noun...
            root = singularize(word)
            dim = dg.generate_diminutive(root)
            new_sent.append(dim + "s")
        else:
            new_sent.append(word)

    return " ".join(new_sent)
Example #3
from pattern.nl import (lemma, predicative, attributive, singularize,
                        conjugate, PRESENT, SG)


def wordvarieties(word):
    """Return common inflected varieties of a Dutch word."""
    lem = lemma(word)                      # verb infinitive
    pre = predicative(word)                # predicative adjective form
    att = attributive(word)                # attributive adjective form
    sin = singularize(word)                # noun singular
    con = conjugate(word, PRESENT, 1, SG)  # first person singular present
    return [lem, pre, att, sin, con]
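A minimal usage sketch; the returned values depend entirely on pattern.nl's rule set, so none are asserted here:

for form in wordvarieties("katten"):
    print(form)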
Example #4
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.88)
    print("pattern.nl.singularize()")
Example #6
from pattern.nl import singularize


def run_lemmatization(entity_dict):
    """Group entity surface forms by their singularized form."""
    lemma_dict = {}

    for ent in entity_dict:
        lemma_ent = singularize(ent)
        # lemma_ent = lemma(ent)
        if lemma_ent.replace(" ", "") == "":
            # Fall back to the original form if singularization yields nothing.
            lemma_ent = ent
        if lemma_ent not in lemma_dict:
            lemma_dict[lemma_ent] = [ent]
        else:
            lemma_dict[lemma_ent].append(ent)

    return lemma_dict
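A small usage sketch with a hypothetical entity dictionary; the singularized keys shown are assumptions about pattern.nl's output:

entities = {"katten": "ANIMAL", "kat": "ANIMAL", "boeken": "OBJECT"}
print(run_lemmatization(entities))
# hypothetical: {"kat": ["katten", "kat"], "boek": ["boeken"]}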
Example #7
def run_lemma_lookup(lemma_dict, unmatched_entities, entity_dict):
    """Match unmatched entities against known entities via their singular form."""
    updated_dict = {}
    with open('lemmatized_overview.txt', 'w') as fh:

        for entity in unmatched_entities:
            if len(entity) >= 4:  # singularization is more error-prone for shorter words
                lemma_entity = singularize(entity)
                # lemma_entity = lemma(entity)
                if lemma_entity in lemma_dict:
                    if len(lemma_dict[lemma_entity]) == 1:
                        entity_match = lemma_dict[lemma_entity][0]
                        cat = entity_dict[entity_match]
                        if len(entity_match) >= 4:
                            updated_dict[entity] = cat
                            fh.write(entity + "\t" + entity_match + "\t" +
                                     cat + "\n")

    # The with-statement closes the file; no explicit fh.close() is needed.
    return updated_dict
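A usage sketch that builds on run_lemmatization above; the names and categories are hypothetical, and the call writes lemmatized_overview.txt as a side effect:

known = {"paard": "ANIMAL", "boek": "OBJECT"}
lemmas = run_lemmatization(known)
print(run_lemma_lookup(lemmas, ["paarden", "boeken"], known))
# hypothetical: {"paarden": "ANIMAL", "boeken": "OBJECT"}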
Example #8
from pattern.nl import parse, split, lemma, singularize, predicative


def prepare_text_nl(row):
    """ Prepares Dutch text by doing the following:
    * Lemmatizing verbs
    * Singularizing plural nouns
    * Converting adjectives to their predicative form

    Parameters:
    -----------
    row : pandas Series
        A row of a pandas dataframe with a Message_Only_Text column.

    Returns:
    --------
    new_message : str
        The processed text.

    """
    try:
        message = split(parse(row.Message_Only_Text))
    except Exception:
        # Fall back to the raw text if parsing fails.
        print(row.Message_Only_Text)
        return row.Message_Only_Text

    new_message = ''

    for sentence in message:
        for word, tag in sentence.tagged:
            if tag == 'MD' or 'VB' in tag:
                new_message += lemma(word) + ' '
            elif tag == 'NNS':
                new_message += singularize(word) + ' '
            elif 'JJ' in tag:
                new_message += predicative(word) + ' '
            else:
                new_message += word + ' '

    return new_message
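A hedged usage sketch with a one-row dataframe; the column name Message_Only_Text comes from the function itself, while the sentence is made up:

import pandas as pd

df = pd.DataFrame({"Message_Only_Text": ["De katten liepen door de grote tuinen."]})
print(df.apply(prepare_text_nl, axis=1)[0])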
Example #9
import logging
import os
import sys
import time
from collections import defaultdict
from subprocess import Popen, PIPE
from xml.etree.ElementTree import Element, SubElement, Comment

from pattern.nl import singularize

# `prettify`, `CMD_EXTRACTOR_SCRIPT` and the word2vec `model` are
# module-level helpers/globals defined elsewhere in the project.


def return_mods(words_found, path_to_db):
    """
    Finds the words and their modifiers using Ruben's terminology extractor.
    For now this function only works with the first words found in WordNet
    by search_in_dwn.

    :param words_found: list of words that are added to an XML pattern file.
    :type words_found: list
    :return container: a container with, per word in words_found, the output of the
    terminology extractor and the word2vec model search.
    :rtype: dictionary
    """
    top = Element('patterns')

    comment = Comment('Pattern file for terminology extractor')
    top.append(comment)
    # Set up storage for later use.
    container = {}
    for word in words_found:
        container[word] = defaultdict(list)  # stores the modifiers per word
        child = SubElement(top, 'pattern', {'len': "2"})
        # Only searches for A N patterns, which is why not all terms end up as
        # an entry in the returned dict. More patterns can be added here.
        SubElement(child, 'p', {"key": "pos", "position": "0", "values": "a"})
        SubElement(child, 'p', {
            "key": "tokens",
            "position": "1",
            "values": word
        })

    # Store the patterns file.
    if not os.path.isdir('patterns'):
        os.mkdir('patterns')

    logging.info("{} writing pattern file".format(time.strftime('%H:%M:%S')))
    file_name = os.path.abspath('.') + '/patterns/xml_pattern-{}.xml'.format(
        time.strftime('%d-%m-%y-%H:%M:%S'))
    with open(file_name, 'wb', 0) as f:  # 0 disables buffering (binary mode)
        f.write(prettify(top).encode('utf8'))

    # Call the terminology extractor with the newly created patterns.
    cmd = ' '.join(
        ['python', CMD_EXTRACTOR_SCRIPT, '-d', path_to_db, '-p', file_name])
    logging.info(cmd)
    logging.info("{} calling terminology extractor".format(
        time.strftime('%H:%M:%S')))
    process = Popen(cmd, stdout=PIPE, shell=True, universal_newlines=True)
    output, err = process.communicate()
    # Store all the terms and their modifiers in a dictionary.
    for term_element in [line.split() for line in output.split('\n') if line]:
        freq, mod, term = term_element
        try:
            container[term]['modifiers'].append((mod, freq))
        except KeyError:
            print("not found in container: {}".format(term), file=sys.stderr)

    for entry_term in container.keys():
        try:
            most_similar_words = model.most_similar(entry_term)
        except KeyError:
            print("not found in model: {}".format(entry_term), file=sys.stderr)
            continue
        singularized = [singularize(w) for w, _ in most_similar_words]
        container[entry_term]['similar'].extend(singularized)
        if entry_term not in container[entry_term]['similar']:
            # Put the search word itself in the results.
            container[entry_term]['similar'].append(entry_term)
    return container
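A hypothetical invocation; it assumes CMD_EXTRACTOR_SCRIPT points at the extractor script, a word2vec model is loaded, and path_to_db is a parsed-corpus database (the example words and path are made up):

mods = return_mods(["fiets", "huis"], "/path/to/corpus.db")
for term, info in mods.items():
    print(term, info["modifiers"][:3], info["similar"][:3])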