Example #1
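This example trains two token-level logistic regression classifiers for character detection in Dutch text: a full model over window features plus word embeddings, and a backoff model over window features alone. Each input file is tagged with Frog; for every token, the backoff classifier is used when the token is absent from the embedding model, and the (proper) nouns predicted to be characters are printed by descending frequency.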
import codecs
import glob
import os
import sys
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Frog, FeatureStacker, Windower, WordEmbeddings, the training data X, y
# and the embedding model `model` come from the surrounding project and
# are assumed to be defined earlier in the script.

# The full features combine token windows with word embeddings; the
# backoff vectorizer uses the windows only.
full_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)),
                                         ('embeddings', WordEmbeddings(model)))
backoff_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)))

# Vectorize the training documents and flatten the per-document labels.
X_full = full_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
y = LabelEncoder().fit_transform([label for labels in y for label in labels])

# Train one classifier on the full features and one on the backoff features.
clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)
frogger = Frog(int(sys.argv[3]))

for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print filename
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
        # Tokenize and tag the document with Frog; drop each token's
        # trailing field and decode the rest to unicode.
        document = frogger.tag(doc)
        document = [[f.decode('utf-8') for f in w[:-1]]
                    for sentence in document for w in sentence]
        words = [word[0] for word in document]
    X_test_full = full_feature_vectorizer.transform([document])
    X_test_backoff = backoff_feature_vectorizer.transform([document])
    for i, word in enumerate(words):
        # Back off to the window-only classifier for tokens the embedding
        # model does not cover.
        if word.lower() not in model:
            pred = clf_backoff.predict(X_test_backoff[i])[0]
        else:
            pred = clf_full.predict(X_test_full[i])[0]
        # Count tokens predicted as characters, restricted to nouns and
        # proper-noun-like tokens (CGN tags N and SPEC).
        if pred == 1 and document[i][2] in ('N', 'SPEC'):
            characters[document[i][0]] += 1
    # Report character names, most frequent first.
    print ', '.join(sorted(characters, key=characters.__getitem__, reverse=True))
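FeatureStacker, Windower and WordEmbeddings are project-specific vectorizers; FeatureStacker apparently concatenates the features of its named components, much like scikit-learn's FeatureUnion. As a rough sketch of the kind of window features presumably involved (the function name and behavior here are assumptions, not the project's implementation):

def window_features(words, window_size=3):
    # Sketch: for each token, collect the surrounding tokens within
    # window_size positions as position-keyed features, padding the edges.
    padded = ['<s>'] * window_size + list(words) + ['</s>'] * window_size
    for i in range(window_size, len(padded) - window_size):
        yield dict(('w[%d]' % offset, padded[i + offset])
                   for offset in range(-window_size, window_size + 1))

Feature dicts like these would typically be run through something like scikit-learn's DictVectorizer before being fed to a classifier.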
Example #2
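This example projects annotated character (or location) spans onto Frog's tokenization: for each story it loads the annotation file, tokenizes the matching .txt file, aligns token offsets with the annotated spans, relabels every overlapping token as animate, and prints the tokens in tab-separated format, separating stories with an <FB/> marker.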
import codecs
import glob
import os
import sys

# Story, Word, frogger (a Frog wrapper) and token_boundaries come from
# the surrounding project and are assumed to be defined earlier.

for filename in glob.glob(os.path.join(sys.argv[2], "SINVS*.ann")):
    if 'anomalies' in filename:
        continue
    story = Story.load(filename)
    # Map annotated (start, end) character offsets to (id, name) pairs,
    # either for characters or for locations.
    if sys.argv[3] == 'chars':
        characters = {(start, end): (id, name)
                      for character in story.characters
                      for id, name, start, end in character.chain}
    else:
        characters = {(start, end): (id, name)
                      for location in story.locations
                      for id, name, start, end in location.chain}

    with codecs.open(filename.replace(".ann", ".txt"), encoding='utf-8') as f:
        orig_text = f.read()
        # Tokenize the original text with Frog and recover each token's
        # character offsets so annotations can be aligned with tokens.
        tokens = [word for sent in frogger.tag(orig_text) for word in sent]
        offsets = list(
            token_boundaries([t[0].decode('utf-8') for t in tokens],
                             orig_text))
        found_characters = {(start, end): False for start, end in characters}
        for char_start, char_end in characters:
            start_found = False
            for i, (start, end) in enumerate(offsets):
                # Relabel every token that overlaps the annotated span.
                if (start == char_start or (start < char_start < end)
                        or (char_start < start < char_end)):
                    start_found = True
                    tokens[i] = Word(*(tokens[i][:-1] + ('animate',)))
            # Record whether the annotated span matched any token.
            found_characters[(char_start, char_end)] = start_found
        # Emit tab-separated tokens, one story per <FB/>-delimited block.
        for token in tokens:
            print '\t'.join(token)
        print '<FB/>'
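token_boundaries is a project helper that maps the tokens back to (start, end) character offsets in the original text. A minimal sketch of such a helper, under the assumption that every token occurs verbatim, left to right, in the text it was tokenized from:

def token_boundaries(tokens, text):
    # Sketch: yield the (start, end) character offsets of each token by
    # scanning the text left to right; text.index raises ValueError for
    # tokens a real implementation would have to normalize first.
    pos = 0
    for token in tokens:
        start = text.index(token, pos)
        pos = start + len(token)
        yield (start, pos)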