from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.parser.spacy_parser import Spacy


def parse_corpus(to_process_file):
    # Read every <Article> element from the XML file, taking the document
    # body from <text> and the document id from <article-id>.
    doc_preprocessor = XMLMultiDocPreprocessor(
        path=to_process_file,
        doc='.//Article',
        text='./text/text()',
        id='./article-id/text()')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
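# Hedged usage sketch for parse_corpus: 'articles.xml' is a hypothetical
# input whose <Article> elements carry a <text> body and an <article-id>,
# matching the XPath expressions above; assumes an initialized snorkel.db.
from snorkel import SnorkelSession
from snorkel.models import Document, Sentence

session = SnorkelSession()
parse_corpus('articles.xml')
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())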
from snorkel.parser import CorpusParser, TSVDocPreprocessor
from snorkel.parser.spacy_parser import Spacy
from snorkel.models import Document, Sentence


def doc_creation(df_features, session):
    # Write the subset directly as a tab-separated file; TSVDocPreprocessor
    # expects one "id<TAB>text" document per line.
    df_features.to_csv('dataset.tsv', sep='\t', header=False)
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
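# Usage sketch for doc_creation, assuming pandas is available; the
# two-document frame below is hypothetical example data. The index supplies
# the document ids for the TSV's first column.
import pandas as pd
from snorkel import SnorkelSession

df_example = pd.DataFrame(
    {'text': ['First example document.', 'Second example document.']},
    index=['doc-1', 'doc-2'])
doc_creation(df_example, SnorkelSession())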
import logging
from os import listdir
from os.path import isfile, join

from sqlalchemy.exc import IntegrityError

from snorkel import SnorkelSession
from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.parser.spacy_parser import Spacy


def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):
    logging.info("Corpus parsing start")
    session = SnorkelSession()
    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]
    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            # Each extracted dump file holds many <doc> elements; the page
            # title serves as the document id.
            doc_preprocessor = XMLMultiDocPreprocessor(
                path=join(dumps_folder_path, file),
                doc='.//doc',
                text='./text()',
                id='./@title')
            # Clear the existing corpus (if requested) only for the first file.
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i += 1
    # logging.debug("Documents: %d", session.query(Document).count())
    # logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
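# Usage sketch: parse a freshly extracted dump, wiping any previously loaded
# corpus before the first file. The default path reflects WikiExtractor's
# output layout; point it at wherever your extracted .xml files live.
logging.basicConfig(level=logging.INFO)
parse_wikipedia_dump(clear=True, parallelism=8)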
from collections import defaultdict

import numpy as np

from snorkel import SnorkelSession
from snorkel.parser import CorpusParser, TSVDocPreprocessor
from snorkel.parser.spacy_parser import Spacy
from snorkel.models import Document, Sentence  # defined in context.py file

# Deduplicate by document id and drop malformed lines: keep only lines with
# exactly two tab-separated columns whose id has not been seen before.
# (`fin`, `fout`, `newfile`, `n_docs`, and the `docID` set come from the
# surrounding script.)
for line in fin:
    if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
        continue
    docID.add(line.split("\t")[0])
    fout.write(line.replace("\n", " ").strip() + "\n")
print("total docID count", len(docID))

doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs)

session = SnorkelSession()

# Parse the corpus only if it has not already been loaded into snorkel.db.
if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # , parallelism=5

print("Documents:", session.query(Document).count())

docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db
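# The defaultdict and numpy imports suggest grouping sentences by their
# parent document; a hedged sketch of that step (the grouping itself is an
# assumption, not part of the original script). Sentence rows reference
# their Document via the .document relationship.
sents_by_doc = defaultdict(list)
for sent in sents:
    sents_by_doc[sent.document.name].append(sent)
print("avg sentences per doc:",
      np.mean([len(v) for v in sents_by_doc.values()]))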
def __init__(self, parser=None, fn=None):
    # Default to the spaCy parser when none is supplied, and hand the
    # post-processing hook `fn` through to CorpusParserUDF.
    self.parser = parser or Spacy()
    super(CorpusParser, self).__init__(CorpusParserUDF,
                                       parser=self.parser,
                                       fn=fn)
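# A minimal sketch of the fn hook, assuming (as in Snorkel's CorpusParserUDF)
# that fn receives and returns each parsed sentence's attribute dict before
# it is stored; lower-casing the tokens here is purely illustrative.
def lowercase_words(parts):
    parts['words'] = [w.lower() for w in parts['words']]
    return parts

corpus_parser = CorpusParser(parser=Spacy(), fn=lowercase_words)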