def timespan_sql_no_dateinfo(
    corpus: Corpus = Corpus(),
    out: Export = Export("korp_timespan/timespan.sql"),
    docs: AllDocuments = AllDocuments(),
    token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
):
    """Create Korp timespan SQL data for a corpus without date information.

    The token spans of every document in ``docs`` are counted, and the total
    is passed to ``create_sql`` as two single-row tables: one whose
    ``datefrom``/``dateto`` are 8 zeros and one whose ``datefrom``/``dateto``
    are 14 zeros.

    Args:
        corpus: Corpus identifier; its upper-cased form is used in the rows.
        out: Export target handed on to ``create_sql``.
        docs: The documents whose tokens are counted.
        token: Token annotation from which the spans are read per document.
    """
    name = corpus.upper()

    # Total number of token spans over all documents.
    total = sum(len(list(token.read_spans(doc))) for doc in docs)

    # All-zero placeholders standing in for the missing dates.
    zero_date = "0" * 8
    zero_datetime = "0" * 14

    date_rows = [
        {
            "corpus": name,
            "datefrom": zero_date,
            "dateto": zero_date,
            "tokens": total,
        }
    ]
    datetime_rows = [
        {
            "corpus": name,
            "datefrom": zero_datetime,
            "dateto": zero_datetime,
            "tokens": total,
        }
    ]

    create_sql(name, out, date_rows, datetime_rows)
def info_sentences(
    out: OutputCommonData = OutputCommonData("cwb.sentencecount"),
    sentence: AnnotationAllDocs = AnnotationAllDocs("<sentence>"),
    docs: AllDocuments = AllDocuments(),
):
    """Count the sentences in the corpus and write the total to ``out``.

    Sentence spans are read for each document in ``docs`` and summed. A
    document whose spans cannot be read because of ``FileNotFoundError``
    contributes nothing to the total. If the total ends up being zero, an
    informational message is logged. The count is written as a string.

    Args:
        out: Destination that receives the sentence count.
        sentence: Sentence annotation from which the spans are read.
        docs: The documents to go through.
    """
    total = 0
    for doc in docs:
        try:
            spans = list(sentence.read_spans(doc))
        except FileNotFoundError:
            # Best effort: a document without readable sentence spans is skipped.
            continue
        total += len(spans)

    if not total:
        log.info("No sentence information found in corpus")

    # Store the result as text.
    out.write(str(total))