예제 #1
0
    def loadMails(self):
        """Read every mail from ``self.archivedir`` into ``self.mails``.

        Each mail yielded by ``mailLoaderGen`` is registered with
        ``self.obj2id``; the value returned by its ``add`` call is stored as
        ``mail.id``. When a mail carries ``tags`` and/or ``type`` attributes
        they are tallied in ``alltags``, a name resolved outside this method
        (presumably a ``Counter`` -- confirm against the enclosing module).
        """
        registry = self.obj2id
        loaded = []

        for mail in mailLoaderGen(self.archivedir):
            # Both attributes are optional on a mail object.
            if hasattr(mail, 'tags'):
                alltags.update(mail.tags)
            if hasattr(mail, 'type'):
                alltags[mail.type] += 1

            mail.id = registry.add(mail)
            loaded.append(mail)

        # Assigned only after the whole archive has been iterated.
        self.mails = loaded
    "date",
    "tags",
    "sfbi_url",
    "contact-nom",
    "contact-email",
    "date-candidature",
    "validite",
    "duree",
    "ville",
    "lieu",
    "labo",
]

# Directory that receives the rewritten mail files.
outdir = "archives_SFBI_AnnotationManuelle"

# Materialise the loader output: the mails are iterated more than once below.
mails = list(mailLoaderGen())

# Count every token over all tokenized sentences of all mail descriptions.
words = Counter()
for mail in mails:
    mail.sents = list(iterTokenizedSentences(mail.description))
    for sent in mail.sents:
        words.update(sent)

# The stemmer is built from the words seen strictly more than 10 times.
frequent_words = {word for word, count in words.items() if count > 10}
stemmer = Stemmer(frequent_words)

for m in mails:
    # Mirror the mail's path under `outdir`. str.strip() interprets its
    # argument as a *set of characters* removed from both ends, so it would
    # also eat trailing letters of the file name; drop the literal leading
    # directory name instead.
    relpath = m.mailfile
    if relpath.startswith("archives_SFBI"):
        relpath = relpath[len("archives_SFBI"):]
    outf = outdir + relpath

    # `d` is the mail's own attribute dict, so the assignments below also
    # update the mail object itself.
    d = m.__dict__
    d["date"] = date.fromtimestamp(d["timestamp"]).strftime("%d %B %Y")

    with open(outf, "wt") as f:
        d["from"] = d.pop("sender")