Пример #1
0
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.parser import TSVDocParser
from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path='data/proteincorpus_sm.tsv')
from snorkel.parser import SentenceParser

sent_parser = SentenceParser()

from snorkel.parser import CorpusParser

cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()


# Open a new session and look up the corpus named 'Protein Training'.
from snorkel import SnorkelSession
from snorkel.models import Corpus

session = SnorkelSession()

# NOTE(review): this looks like a SQLAlchemy-style query, where .one()
# requires exactly one matching row - confirm against the session type.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Protein Training')
    .one()
)
# Bare expression so the notebook displays the corpus as the cell output.
corpus
Пример #2
0

from snorkel import SnorkelSession
session = SnorkelSession()
import os

from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path="data/clinton_train.tsv")

from snorkel.parser import SentenceParser

sent_parser = SentenceParser()
from snorkel.parser import CorpusParser

cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
session.add(corpus)
session.commit()


for name, path in [('Emails Development', 'data/clinton_dev.tsv'),
                   ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

# Collect every sentence for which number_of_people() returns less than 5.
# NOTE(review): `corpus` is whatever was bound most recently; if this runs
# directly after the dev/test parsing loop, that is the last corpus parsed
# there, not the training corpus - confirm which corpus is intended.
# number_of_people is expected to be defined elsewhere.
sentences = set()
for document in corpus:
    sentences.update(
        sentence
        for sentence in document.sentences
        if number_of_people(sentence) < 5
    )