Example #1
import codecs

# `tag` is assumed to be pattern.de.tag, as in the other examples;
# `pattern_punctuation` (a compiled punctuation regex) and `transform`
# are defined elsewhere in the source project.
from pattern.de import tag


def main(filepath):
    text = ''

    with codecs.open(filepath, 'r', 'utf-8') as f:
        text = f.read()

    text = pattern_punctuation.sub(' ', text)
    tagged_words = tag(text)

    word_counts = {}

    for tagword in tagged_words:
        transformed_word = transform(tagword)

        if transformed_word is None:
            continue

        word_counts[transformed_word] = word_counts.get(transformed_word, 0) + 1

    tuple_list = sorted(word_counts.items(),
                        key=lambda item: item[1],
                        reverse=True)

    for tupl in tuple_list:
        print(tupl[1], tupl[0])
Example #2
 def test_tag(self):
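     # `de` refers to the pattern.de module imported by the test suite.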
     # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")].
     v = de.tag("der grosse Hund")
     self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")])
     print("pattern.de.tag()")
Example #3
# -*- coding: utf-8 -*-

from pattern.de import parse, split, pprint, tag
# from pprint import pprint

# s = parse('Die Katze liegt auf der Matte.')
# for sentence in split(s):
#     for word in sentence:
#         print(word)
#     pprint(sentence)

pprint(
    parse('Die Katze liegt auf der Matte mit weniger als 10%.',
          tags=True,
          chunks=True,
          relations=True,
          lemmata=True,
          encoding='utf-8',
          tagset="STTS"))
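# The keyword arguments above choose which annotation layers parse() adds
# (part-of-speech tags, chunks, relations, lemmata); tagset="STTS" switches
# the tag names from pattern's default Penn Treebank-style set to the
# Stuttgart-Tuebingen tagset.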

for word, pos in tag('Die Katze liegt auf der Matte mit weniger als 10%.',
                     tagset="STTS"):
    if pos in ("ARTDEF", "NN"):
        print(word + '\t' + pos)
Example #4
File: test_de.py  Project: iicc/pattern
 def test_tag(self):
     # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")].
     v = de.tag("der grosse Hund")
     self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")])
     print("pattern.de.tag()")
Example #5
 def tag_article(article_to_tag):
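     # TagTuple is a project-specific helper class wrapping the (word, tag)
     # pairs returned by tag().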
     new_list = []
     for entry in tag(article_to_tag.content):
         new_list.append(TagTuple.TagTuple(entry[0], entry[1]))
     article_to_tag.tagged_content = new_list
Example #6
from pattern.de import parsetree, singularize, tag

# `filename` is defined earlier in the source script; the *_name_* strings
# below are rebound to dictionaries once the JSON file names have been built.
dict_name_nn = 'dict_' + filename + '_nn'
dict_name_ne = 'dict_' + filename + '_ne'
json_name_nn = dict_name_nn + '.json'
json_name_ne = dict_name_ne + '.json'

dict_name_nn = {}
dict_name_ne = {}

with open('testfile.txt', 'r') as openfile:
    read_text = openfile.read()
    parsetree_text = parsetree(read_text)
    # pprint(parsetree_text)
    # read_text = read_text.rstrip('\n')
    # print(re.findall(r'[\w]+|[.,!?;]', read_text))
    # pprint(parse(read_text, tags=True, chunks=True, relations=True,
    #              lemmata=True, encoding='utf-8', tagset='STTS'))
    for word, pos in tag(read_text, tagset='STTS'):
        if pos == 'NN':

            # Count common nouns by their singular form.
            word = singularize(word)

            if word not in dict_name_nn:
                dict_name_nn[word] = 1
            else:
                dict_name_nn[word] += 1
Example #7
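# This appears to be web2py controller code: `db` and `SQLFORM` come from the
# framework's environment; `tag` and `singularize` are pattern.de functions.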
def insertTagsToParsedDB(lastID, lastTitel, lastText):
    """Function to process the input, POS-tag it and write it in the
    DB dbparsedText."""
    parsedDatabase = SQLFORM(db.dbParsedText)
    inputDatabase = SQLFORM(db.dbInput)

    dictNN = {}
    dictNE = {}

    # SQL Query to extract ID, Title and Text
    extractQueryInputDB = db.executesql('select id, inputTitle, inputText\
                                        from dbInput')
    lastText = extractQueryInputDB[-1][-1]

    # Begin of For-Loop for POS-Tagging
    for word, postag in tag(lastText, tagset='STTS'):
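        # STTS tags: 'NN' marks common nouns, 'NE' marks proper names.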
        word = word.decode('utf-8')

        if postag == 'NN':

            word = singularize(word)

            if word not in dictNN:
                dictNN[word] = 1
            else:
                dictNN[word] += 1

        elif postag == 'NE':

            word = singularize(word)

            if word not in dictNE:
                dictNE[word] = 1
            else:
                dictNE[word] += 1

    listNN = dictNN.items()
    listNE = dictNE.items()

    # for key, value in dict710WC.iteritems():
    #     print key
    # print dict710WC

    # print 'Letzte ID: ' + str(lastID)
    # print 'Letzter Titel: ' + str(lastTitel)
    # print 'Letzter Text: ' + lastText
    # print '\n\n'
    # print dictNE
    # print '\n'
    # print dictNN
    # return extractQueryInputDB
    # return locals()
    return dictNE, dictNN
Example #8
import fnmatch
import json
import os

from pattern.de import singularize, tag


def buildWordList():
    """
    Function to build lists and dictionaries.

    Function exports txt-files incl. dictionaries and lists from both DDC for
    the words that are tagged as NE and NN.
    """
    # Defining list variables
    list330NE = []
    list330NN = []
    list330 = []
    list710NE = []
    list710NN = []
    list710 = []

    # Defining dictionary variables
    dict330NE = {}
    dict330NN = {}
    dict330 = {}
    dict330WithoutCommons = {}
    dict710NE = {}
    dict710NN = {}
    dict710 = {}
    dict710WithoutCommons = {}
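
    # The corpus under ../../collecting/temp/ is assumed to be organised into
    # per-DDC subdirectories (e.g. .../330/ and .../710/); the DDC class is
    # read from the last three characters of each directory path.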

    for dirpath, dirs, files in os.walk('../../collecting/temp/'):
        for filename in fnmatch.filter(files, '*.txt'):
            with open(os.path.join(dirpath, filename), 'r') as openfile:

                parsefile = openfile.read()
                # parsefile = parse(parsefile)

                ddcFromFilepath = dirpath[-3:]

                for word, postag in tag(parsefile, tagset='STTS'):
                    word = word.decode('utf-8')

                    if ddcFromFilepath == '330':
                        if postag == 'NN':

                            word = singularize(word)

                            list330NN.append(word)
                            list330.append(word)

                            if word not in dict330NN:
                                dict330NN[word] = 1
                            else:
                                dict330NN[word] += 1

                        elif postag == 'NE':

                            word = singularize(word)

                            list330NE.append(word)
                            list330.append(word)

                            if word not in dict330NE:
                                dict330NE[word] = 1
                            else:
                                dict330NE[word] += 1

                    elif ddcFromFilepath == '710':
                        if postag == 'NN':

                            word = singularize(word)

                            list710NN.append(word)
                            list710.append(word)

                            if word not in dict710NN:
                                dict710NN[word] = 1
                            else:
                                dict710NN[word] += 1

                        elif postag == 'NE':

                            word = singularize(word)

                            list710NE.append(word)
                            list710.append(word)

                            if word not in dict710NE:
                                dict710NE[word] = 1
                            else:
                                dict710NE[word] += 1

    # Building list with words common in both DDCs
    listCommonWords = list(set(list330).intersection(list710))

    # Building new lists without the common words
    list330WithoutCommons = []
    list710WithoutCommons = []

    for i in list330:
        if i not in listCommonWords:
            list330WithoutCommons.append(i)

    for i in list710:
        if i not in listCommonWords:
            list710WithoutCommons.append(i)

    # Building new dictionaries without the common words
    for i in list330WithoutCommons:
        if i not in dict330WithoutCommons:
            dict330WithoutCommons[i] = 1
        else:
            dict330WithoutCommons[i] += 1

    for i in list710WithoutCommons:
        if i not in dict710WithoutCommons:
            dict710WithoutCommons[i] = 1
        else:
            dict710WithoutCommons[i] += 1

    # Merge NE and NN dictionaries into one dictionary
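    # (If a word was counted under both NE and NN, the second update()
    # overwrites its NE count with the NN count rather than summing them.)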
    dict330.update(dict330NE)
    dict330.update(dict330NN)
    dict710.update(dict710NE)
    dict710.update(dict710NN)

    # Dump dictionaries into JSON files
    exports = [
        ('dict330NE.json', dict330NE),
        ('dict330NN.json', dict330NN),
        ('dict330WithoutCommons.json', dict330WithoutCommons),
        ('dict330All.json', dict330),
        ('dict710NE.json', dict710NE),
        ('dict710NN.json', dict710NN),
        ('dict710WithoutCommons.json', dict710WithoutCommons),
        ('dict710All.json', dict710),
    ]

    for exportname, exportdict in exports:
        with open('../../collecting/' + exportname, 'w') as exportfile:
            json.dump(exportdict,
                      exportfile,
                      sort_keys=True,
                      indent=4,
                      ensure_ascii=False,
                      separators=(',', ': '))