from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.parser.spacy_parser import Spacy


def parse_corpus(to_process_file):
    # Read each <Article> element of the XML file as one document.
    doc_preprocessor = XMLMultiDocPreprocessor(path=to_process_file,
                                               doc='.//Article',
                                               text='./text/text()',
                                               id='./article-id/text()')
    # Split the documents into sentences with spaCy and persist them.
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
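A minimal usage sketch for the function above, assuming the same SnorkelSession setup used in the later examples; the input file name articles.xml is only a placeholder.

# Hypothetical usage: 'articles.xml' is a placeholder input file.
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence

session = SnorkelSession()
parse_corpus('articles.xml')
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())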
Example #2
import csv

from snorkel.parser import CorpusParser, TSVDocPreprocessor
from snorkel.parser.spacy_parser import Spacy
from snorkel.models import Document, Sentence


def doc_creation(df_features, session):
    # Write the subset to a .csv file and convert it to a .tsv file.
    df_features.to_csv('dataset.csv', header=False)
    csv.writer(open('dataset.tsv', 'w+'),
               delimiter='\t').writerows(csv.reader(open("dataset.csv")))

    # Parse the TSV (one document per line) into the Snorkel database.
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
Example #3
import logging
from os import listdir
from os.path import isfile, join

# Assumes the IntegrityError raised on duplicate documents is SQLAlchemy's.
from sqlalchemy.exc import IntegrityError

from snorkel import SnorkelSession
from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.parser.spacy_parser import Spacy


def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):

    logging.info("Corpus parsing start")
    session = SnorkelSession()

    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            doc_preprocessor = XMLMultiDocPreprocessor(path=dumps_folder_path +
                                                       file,
                                                       doc='.//doc',
                                                       text='./text()',
                                                       id='./@title')
            # Only allow clearing the database for the first file parsed.
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1
    #logging.debug("Documents: %d", session.query(Document).count())
    #logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
Example #4

# Deduplicate the source TSV by document ID and write the cleaned lines out.
# The open file handles (fin for the input, fout for the output file named
# newfile) and the docID set are assumed to be created earlier in the script.
for line in fin:
    if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
        continue
    docID.add(line.split("\t")[0])
    fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

# Parse the corpus only if the database does not already contain documents.
if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db
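The defaultdict and numpy imports are not used in the lines shown; one plausible follow-up, sketched here only as an assumption, is grouping the retrieved sentences by their parent document.

# Sketch only: group sentences under their parent document's name.
sents_by_doc = defaultdict(list)
for sent in sents:
    sents_by_doc[sent.document.name].append(sent)
print("Average sentences per document:",
      np.mean([len(v) for v in sents_by_doc.values()]))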
# The CorpusParser constructor: if no parser is given it defaults to Spacy
# and delegates the per-document work to CorpusParserUDF.
def __init__(self, parser=None, fn=None):
    self.parser = parser or Spacy()
    super(CorpusParser, self).__init__(CorpusParserUDF,
                                       parser=self.parser,
                                       fn=fn)
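Since the constructor falls back to Spacy when no parser is passed, the two constructions below are equivalent; passing the argument explicitly only matters when substituting a different parser.

# Equivalent: both use spaCy as the underlying sentence parser.
corpus_parser = CorpusParser()
corpus_parser = CorpusParser(parser=Spacy())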