예제 #1
0
def lemma_via_patternlib(token, pos):
    """Reduce *token* to a base form using the pattern library helpers.

    An ``NP`` tag is singularized, any ``V*`` tag is conjugated to the
    infinitive, and ``ADJ*``/``ADV*`` tags get their predicative base
    form.  Tokens with any other tag are returned unchanged.
    """
    if pos == 'NP':  # noun phrase -> singular noun
        return singularize(token)
    if pos.startswith('V'):  # verb -> infinitive
        return conjugate(token)
    if pos.startswith(('ADJ', 'ADV')):  # adjective/adverb -> base form
        return predicative(token)
    return token
예제 #2
0
 def test_singularize(self):
     """Check singularization accuracy against the CELEX word forms."""
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     correct = total = 0
     for tag, sg, pl in Datasheet.load(corpus):
         # Only noun rows ("n") carry a singular/plural pair to test.
         if tag == "n":
             correct += int(de.singularize(pl) == sg)
             total += 1
     # Require better than 82% accuracy on the corpus.
     self.assertTrue(float(correct) / total > 0.82)
     print("pattern.de.singularize()")
예제 #3
0
 def test_singularize(self):
     """Assert the accuracy of the singularization algorithm.

     Iterates the CELEX German word-form corpus and requires that more
     than 81% of noun plurals singularize back to the listed singular.
     """
     from pattern.db import Datasheet
     i, n = 0, 0
     for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "n":
             if de.singularize(pl) == sg:
                 i += 1
             n += 1
     self.assertTrue(float(i) / n > 0.81)
     # Fixed: was a Python 2 "print" statement, which is a SyntaxError
     # under Python 3 and inconsistent with the sibling test that
     # already calls print().
     print("pattern.de.singularize()")
예제 #4
0
def _getSingularize(word, language):
    """Return *word* singularized by the pattern module for *language*.

    Supported language codes are "en", "es", "fr", "de" and "it"; any
    other code falls back to the English singularizer.
    """
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    # Dispatch table instead of an if/elif chain; unknown codes fall
    # back to English, matching the original else branch.
    modules = {
        "en": pattern_en,
        "es": pattern_es,
        "fr": pattern_fr,
        "de": pattern_de,
        "it": pattern_it,
    }
    return modules.get(language, pattern_en).singularize(word)
예제 #5
0
                # Tagger reported nothing for this token -> mark it unknown.
                if "no result for" in output:
                        knownWord=0
        ## Feature 4: no free pronouns (Stanford) yes(1)/no(0)
        ## go through sentence and see if there are tags for pronouns 
                # sentTagged holds (word, tag) pairs; index [1] is the tag.
                wordsPOS0=sentTagged[j]
                wordsPOS=wordsPOS0[1]
                # STTS pronoun tags: possessive, demonstrative, relative,
                # interrogative, indefinite, reflexive, personal.
                if wordsPOS=="PPOS" or wordsPOS=="PDS" or wordsPOS=="PRELS" or wordsPOS=="PWS" or wordsPOS=="PIS" or wordsPOS=="PRF" or wordsPOS=="PPER":
                        pronouns=1
                # STTS adjective tags (attributive / adverbial-predicative).
                elif wordsPOS=="ADJA" or wordsPOS=="ADJD":
                        adj=adj+1

        ## Feature 7: word frequencies (look up frequencies in DeReWo)
                rf=0
                countInList=0
                # Singularize and lower-case the token before comparing it
                # against the DeReWo frequency word list.
                word2 = wordsLowWOPunc[j]
                word2=singularize(word2.decode("utf-8")).lower()
                for i in range(0,len(freqWo)):
                        if freqWo[i].decode("utf-8")== word2:
                                # NOTE(review): rf is still 0 at this point,
                                # so freqNo[i] is overwritten with 0 and the
                                # "rf > 14" branch below can never fire --
                                # rf was presumably meant to be read from
                                # the frequency list; confirm upstream.
                                freqNo[i]=rf
                                countInList=countInList+1
                                if rf > 14:
                                        rareFreq=rareFreq+1
                                        break
                ## if the word is not in the list it must be rare therefore increase count
                if countInList==0:
                        rareFreq=rareFreq+1

        ## Feat. 8: mean word length
                # Accumulates word lengths; presumably divided by the token
                # count elsewhere to get the mean -- not visible here.
                wordLen=len(word2)
                wordLenMean=wordLen+wordLenMean
예제 #6
0
# Occurrence counts: common nouns (NN) and named entities (NE).
dict_name_nn = {}
dict_name_ne = {}

with open('testfile.txt', 'r') as openfile:
    read_text = openfile.read()
    # Parse tree is built but not used below (only in the commented-out
    # pprint for debugging).
    parsetree_text = parsetree(read_text)
    # pprint(parsetree_text)
    # read_text = read_text.rstrip('\n')
    # print(re.findall(r'[\w]+|[.,!?;]', read_text))
    # pprint(parse(read_text, tags=True, chunks=True, relations=True,
    #              lemmata=True, encoding='utf-8', tagset='STTS'))
    # POS-tag with the German STTS tagset and count singularized nouns.
    for word, pos in tag(read_text, tagset='STTS'):
        if pos == 'NN':

            singularForm = singularize(word)

            # Replace the word with its singular form when they differ.
            if word == singularForm:
                pass
                # plural = True
                # print word + '\t' + singularForm + '\t' + str(plural)
            else:
                word = singularForm
                # plural = False
                # print word + '\t' + singularForm + '\t' + str(plural)

            # NOTE(review): the elif condition is the exact complement of
            # the if, so the final else is unreachable.
            if word not in dict_name_nn.keys():
                dict_name_nn[word] = 1
            elif word in dict_name_nn.keys():
                dict_name_nn[word] += 1
            else:
예제 #7
0
import codecs

# not used, but possibly interesting http://www.nltk.org/

# http://www.clips.ua.ac.be/pages/pattern-de
from pattern.de import lemma, tag, predicative, singularize

# possible parts of speech:
# PRP$, FW, VBN, WDT, JJ, WP, DT, RP, NN, TO, PRP,
# RB, NNS, NNP, VB, WRB, CC, LS, CD, IN, MD, UH
# Map POS tags to the pattern.de normalization to apply:
# predicative() for pronoun/determiner/adjective tags, lemma() for
# verbs, singularize() for nouns.
part_of_speech_command = {
    'PRP$': lambda word: predicative(word),  # possessive pronouns
    'VBN': lambda word: lemma(word),  # past participles -> verb lemma
    'DT': lambda word: predicative(word),  # determiners
    'VB': lambda word: lemma(word),  # base-form verbs -> lemma
    'NN': lambda word: singularize(word),  # nouns -> singular
    'JJ': lambda word: predicative(word)  # adjectives -> base form
}

# A token must contain at least one ASCII letter to be transformed.
pattern_word = re.compile('[a-zA-Z]')
# Dashes, guillemets and similar punctuation (ur'' literal: this file
# targets Python 2).
pattern_punctuation = re.compile(ur'[—\-|«»…–<>]')


def transform(tagword):
    # tagword is a (word, part-of-speech) pair, e.g. from tag().
    word = tagword[0]
    part = tagword[1]
    # if part == 'VBN':
    #     print tagword

    # word must contain some letters
    # (function body continues beyond this excerpt)
    if not bool(pattern_word.match(tagword[0])):
예제 #8
0
def insertTagsToParsedDB(lastID, lastTitel, lastText):
    """Function to process the input, POS-tag it and write it in the
    DB dbparsedText.

    POS-tags the most recent text from dbInput with the STTS tagset and
    counts the singularized common nouns (NN) and named entities (NE).

    :param lastID: id of the last input row (unused in the visible body).
    :param lastTitel: title of the last input row (unused in the visible
        body).
    :param lastText: text to tag; overwritten below with the freshest
        row fetched from dbInput.
    :return: tuple ``(dictNE, dictNN)`` mapping word -> occurrence count.
    """
    # Kept for their (possible) web2py side effects, as in the original.
    parsedDatabase = SQLFORM(db.dbParsedText)
    inputDatabase = SQLFORM(db.dbInput)

    dictNN = {}
    dictNE = {}

    # SQL Query to extract ID, Title and Text
    extractQueryInputDB = db.executesql('select id, inputTitle, inputText\
                                        from dbInput')
    # Use the text of the most recent row, ignoring the passed-in value.
    lastText = extractQueryInputDB[-1][-1]

    # POS-tag and count nouns; the original if/elif/else counting (whose
    # else was unreachable) collapses to a dict.get increment, and the
    # "replace only if different" dance is identical to always assigning
    # the singular form.
    for word, postag in tag(lastText, tagset='STTS'):
        word = word.decode('utf-8')
        if postag not in ('NN', 'NE'):
            continue
        word = singularize(word)
        counts = dictNN if postag == 'NN' else dictNE
        counts[word] = counts.get(word, 0) + 1

    return dictNE, dictNN
예제 #9
0
def _bumpCount(counts, word):
    """Increment the occurrence count of *word* in the dict *counts*."""
    counts[word] = counts.get(word, 0) + 1


def _dumpJson(path, dictionary):
    """Write *dictionary* to *path* as pretty-printed, sorted JSON."""
    with open(path, 'w') as exportfile:
        json.dump(dictionary,
                  exportfile,
                  sort_keys=True,
                  indent=4,
                  ensure_ascii=False,
                  separators=(',', ': '))


def buildWordList():
    """
    Function to build lists and dictionaries.

    Function exports txt-files incl. dictionaries and lists from both DDC for
    the words that are tagged as NE and NN.

    Walks ../../collecting/temp/, POS-tags every *.txt file with the
    STTS tagset, counts singularized NN/NE words per DDC class ('330'
    and '710', taken from the last three characters of the directory
    path), and dumps the resulting dictionaries as JSON files.
    """
    # Per-DDC word lists and per-tag count dictionaries.
    lists = {'330': {'NE': [], 'NN': [], 'all': []},
             '710': {'NE': [], 'NN': [], 'all': []}}
    dicts = {'330': {'NE': {}, 'NN': {}},
             '710': {'NE': {}, 'NN': {}}}

    for dirpath, dirs, files in os.walk('../../collecting/temp/'):
        for filename in fnmatch.filter(files, '*.txt'):
            ddcFromFilepath = dirpath[-3:]
            # Files outside the two known DDC classes were tagged and
            # discarded before; skip them up front.
            if ddcFromFilepath not in lists:
                continue
            # os.path.join replaces the original hand-built path
            # ('../../collecting/temp/' + dirpath + '/' + filename),
            # which only resolved correctly by accidental '..'
            # cancellation.
            with open(os.path.join(dirpath, filename), 'r') as openfile:
                parsefile = openfile.read()

            for word, postag in tag(parsefile, tagset='STTS'):
                word = word.decode('utf-8')
                if postag not in ('NN', 'NE'):
                    continue
                # Always counting the singular form is identical to the
                # original "replace only when different" logic.
                word = singularize(word)
                lists[ddcFromFilepath][postag].append(word)
                lists[ddcFromFilepath]['all'].append(word)
                _bumpCount(dicts[ddcFromFilepath][postag], word)

    list330 = lists['330']['all']
    list710 = lists['710']['all']

    # Words common to both DDCs (set membership is O(1) per lookup).
    listCommonWords = set(list330).intersection(list710)

    # Count dictionaries for the words unique to each DDC.
    dict330WithoutCommons = {}
    for word in list330:
        if word not in listCommonWords:
            _bumpCount(dict330WithoutCommons, word)

    dict710WithoutCommons = {}
    for word in list710:
        if word not in listCommonWords:
            _bumpCount(dict710WithoutCommons, word)

    # Merge NE and NN dictionaries into one dictionary per DDC.
    # Order matters: NN counts overwrite NE counts for shared words,
    # exactly as in the original update sequence.
    dict330 = {}
    dict330.update(dicts['330']['NE'])
    dict330.update(dicts['330']['NN'])
    dict710 = {}
    dict710.update(dicts['710']['NE'])
    dict710.update(dicts['710']['NN'])

    # Dump dictionaries into JSON files (same paths and format as before).
    _dumpJson('../../collecting/dict330NE.json', dicts['330']['NE'])
    _dumpJson('../../collecting/dict330NN.json', dicts['330']['NN'])
    _dumpJson('../../collecting/dict330WithoutCommons.json',
              dict330WithoutCommons)
    _dumpJson('../../collecting/dict330All.json', dict330)
    _dumpJson('../../collecting/dict710NE.json', dicts['710']['NE'])
    _dumpJson('../../collecting/dict710NN.json', dicts['710']['NN'])
    _dumpJson('../../collecting/dict710WithoutCommons.json',
              dict710WithoutCommons)
    _dumpJson('../../collecting/dict710All.json', dict710)