Пример #1
0
 def test_get_epithet_of_author(self):
     """Verify get_epithet_of_author('0016') returns 'Historici/-ae'."""
     expected = 'Historici/-ae'
     actual = get_epithet_of_author('0016')
     self.assertEqual(actual, expected)
Пример #2
0
    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            # TODO: remove words less than 3 chars long
            yield file[3:-4], fo.read()

# Build a DataFrame with one row per lemmatized file (id, author, text,
# epithet), report how long that took, then prepare a per-text length filter.
t0 = dt.datetime.utcnow()

# Lookup used below to resolve each file id to its author.
map_id_author = get_id_author()

# Collect the records first and build the frame in one call.
# DataFrame.append copied the whole frame on every iteration (quadratic in the
# number of files) and no longer exists in pandas >= 2.0.
rows = []
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    rows.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet})

# Every column name needs its own comma: adjacent string literals are silently
# concatenated, which would create a bogus 'authortext' column.
df = pandas.DataFrame(rows, columns=['id', 'author', 'text', 'epithet'])

print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))


text_list = df['text'].tolist()

# make a list of short texts to drop
# For pres, get distributions of words per doc
# NOTE(review): despite the name, this stores the index of every text LONGER
# than 500 characters and None for the others. Confirm which of the two the
# downstream drop step expects before changing the condition.
short_text_drop_index = [index if len(text) > 500 else None for index, text in enumerate(text_list) ]  # ~100 words


# Restart the timer for the next step.
t0 = dt.datetime.utcnow()
Пример #3
0
        with open(filepath) as fo:
            # TODO: remove words less than 3 chars long
            yield file[3:-4], fo.read()


# In[3]:

# Build a DataFrame with one row per lemmatized file (id, author, text,
# epithet) and report how long that took.
t0 = dt.datetime.utcnow()

# Lookup used below to resolve each file id to its author.
map_id_author = get_id_author()

# Collect the records first and build the frame in one call.
# DataFrame.append copied the whole frame on every iteration (quadratic in the
# number of files) and no longer exists in pandas >= 2.0.
rows = []
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    rows.append(
        {
            'id': _id,
            'author': author,
            'text': text,
            'epithet': epithet,
        }
    )

# Every column name needs its own comma: adjacent string literals are silently
# concatenated, which would create a bogus 'authortext' column.
df = pandas.DataFrame(rows, columns=['id', 'author', 'text', 'epithet'])

print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))

# In[4]:
Пример #4
0
 def test_get_epithet_of_author(self):
     """Author id '0016' should yield the epithet 'Historici/-ae'."""
     self.assertEqual(get_epithet_of_author('0016'), 'Historici/-ae')
Пример #5
0
 def test_get_epithet_of_author(self):
     """Check the epithet returned by get_epithet_of_author() for id "0016"."""
     author_id = "0016"
     result = get_epithet_of_author(author_id)
     self.assertEqual(result, "Historici/-ae")