Example #1
import os

import qcrit.extract_features
from qcrit.textual_feature import setup_tokenizers

#download_corpus and parse_txt are assumed to be defined or imported elsewhere in the project
def feature_extraction(corpus_path, output, features):
    '''Perform a feature extraction'''
    download_corpus(corpus_path)
    setup_tokenizers(terminal_punctuation=('.', '?', '!'), language='english')
    qcrit.extract_features.main(
        corpus_dir=os.path.join(*corpus_path),
        file_extension_to_parse_function={
            'txt': parse_txt,
        },
        output_file=output,
        features=features,
    )
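
#A minimal usage sketch; the corpus path, output file name, and features value
#below are assumptions for illustration, not taken from the original.
if __name__ == '__main__':
    feature_extraction(
        corpus_path=('tesserae', 'texts', 'grc'),
        output='output.pickle',
        features=None,  #None is assumed to mean "extract every registered feature"
    )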
Example #2
import os
import sys

import qcrit.extract_features
from qcrit.textual_feature import setup_tokenizers

#download_corpus and composite_files are assumed to be defined or imported elsewhere in the project
def main():
    '''Main'''
    corpus_path = ('tesserae', 'texts', 'grc')
    download_corpus(corpus_path)

    #'FULL STOP', 'SEMICOLON', 'GREEK QUESTION MARK'
    setup_tokenizers(terminal_punctuation=('.', ';', ';'))

    if len(sys.argv) > 2 and sys.argv[2] == '-u':
        import qcrit.features.universal_features  #seemingly unused, but allows the recognition of features
    else:
        import qcrit.features.ancient_greek_features  #seemingly unused, but allows the recognition of features

    #Feature extractions
    qcrit.extract_features.main(
        os.path.join(*corpus_path),
        {'tess': qcrit.extract_features.parse_tess},

        #Exclude all files of genres not specified. Exclude composite files no matter what
        excluded_paths=composite_files,
        output_file=None if len(sys.argv) <= 1 else sys.argv[1])
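
#Usage sketch (the script name is an assumption, not from the original):
#  python greek_features.py feature_data.pickle       extracts ancient Greek features
#  python greek_features.py feature_data.pickle -u    extracts universal features instead
#sys.argv[1] names the output file; if omitted, output_file is None and
#presumably no file is written.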
Example #3
# -*- coding: utf-8 -*-
#pylint: disable = missing-docstring, blacklisted-name, unused-argument, invalid-name, line-too-long, protected-access
import unittest
import re

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars

import context  #pylint: disable=unused-import
from qcrit import textual_feature

#[^\s\d’”\'\")\)\]\}\.,:;]
#[“‘—\-†&vâ\*\^(α-ωΑ-Ὠ`̔]
#΄´´``′″‴
textual_feature.setup_tokenizers(terminal_punctuation=('.', ';', ';'))
p = PunktLanguageVars()
#TODO don't mess with the PunktLanguageVars instance variables, mess with the class variables
p._re_word_tokenizer = re.compile(
    PunktLanguageVars._word_tokenize_fmt % {
        'NonWord':
        r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'MultiChar':
        PunktLanguageVars._re_multi_char_punct,
        'WordStart':
        r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
    }, re.UNICODE | re.VERBOSE)
p._re_period_context = re.compile(
    PunktLanguageVars._period_context_fmt % {
        'NonWord':
        r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'SentEndChars': p._re_sent_end_chars,
    }, re.UNICODE | re.VERBOSE)
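
#A minimal sketch (not part of the original test) exercising the customized
#language vars; the sample text is an assumption.
tokenizer = PunktSentenceTokenizer(lang_vars=p)
print(tokenizer.tokenize('πρῶτος λόγος. δεύτερος λόγος.'))  #expect two sentences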
Example #4

#pylint: disable = missing-docstring, blacklisted-name, unused-argument, invalid-name
'''Test feature extraction'''
import unittest

import context #pylint: disable=unused-import
from qcrit.extract_features import main, parse_tess
from qcrit.textual_feature import textual_feature, setup_tokenizers

#Run this file with "-b" to ignore output in passing tests (failing tests still display output)

setup_tokenizers(terminal_punctuation=('.', ';', ';')) #'FULL STOP', 'SEMICOLON', 'GREEK QUESTION MARK'

@textual_feature(tokenize_type='words', debug=True)
def dummy_feature(text):
	pass

class TestExtractFeatures(unittest.TestCase):

	def testAllNone(self):
		self.assertRaises(ValueError, main, corpus_dir=None, file_extension_to_parse_function=None)

	def testInvalidCorpusDirectory(self):
		self.assertRaises(ValueError, main, corpus_dir='abc', file_extension_to_parse_function={'tess': parse_tess})

	def testExcludedPaths(self):
		self.assertRaises(ValueError, main, corpus_dir='.', file_extension_to_parse_function={'tess': parse_tess}, excluded_paths=[])

	def testEmptyFeatures(self):
		self.assertRaises(ValueError, main, corpus_dir='.', file_extension_to_parse_function={'tess': parse_tess}, features=[])

	def testOutputAlreadyExists(self):
Example #5

'''
Latin features
'''
import os
import subprocess
import sys
import re

import qcrit.extract_features
from qcrit.textual_feature import textual_feature, setup_tokenizers

CORPUS_DIR = os.path.join('tesserae', 'texts', 'la')

TERMINAL_PUNCTUATION = ('.', '?', '!')
setup_tokenizers(terminal_punctuation=TERMINAL_PUNCTUATION)


def _download_corpus():
    '''
    Downloads the Latin corpus from tesserae

    Sparse checkout reference:
    https://stackoverflow.com/a/28039894/7102572
    '''
    if not os.path.isdir('tesserae'):
        try:
            cmd_list = (
                'mkdir tesserae',
                'git -C tesserae init',
                'git -C tesserae remote add origin https://github.com/timgianitsos/tesserae.git',
                'git -C tesserae config core.sparseCheckout true',
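                #The original snippet is truncated here; a typical sparse checkout
                #(an illustration, not the original's remaining commands) continues:
                #  echo '<subdirectory>' >> tesserae/.git/info/sparse-checkout
                #  git -C tesserae pull origin master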
Example #6
import os
from shlex import quote

_CURRENT_DIR = os.path.dirname(__file__)
#If the output file already exists, the feature extraction code will not override it
#Delete the output file so that the demo can create one
if os.path.isfile(os.path.join(_CURRENT_DIR, 'output.pickle')):
    os.system('rm ' + quote(os.path.join(_CURRENT_DIR, 'output.pickle')))
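#Design note: os.remove(os.path.join(_CURRENT_DIR, 'output.pickle')) would delete
#the file without spawning a shell; since the original shells out to `rm`,
#shlex.quote guards the path against shell injection.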

import context  #pylint: disable=unused-import
import qcrit.extract_features
from qcrit.textual_feature import textual_feature, setup_tokenizers
from functools import reduce
from unicodedata import normalize

#Let sentence tokenizer know that periods and semicolons are the punctuation marks that end sentences
setup_tokenizers(terminal_punctuation=('.', ';'))


#Using 'words' makes the input 'text' parameter become a list of words
@textual_feature(tokenize_type='words')
def num_conjunctions(text):  #parameter must be the text of a file
    return reduce(
        lambda count, word: count + (1 if word in {
            normalize('NFD', val)
            for val in ['καί', 'καὶ', 'ἀλλά', 'ἀλλὰ', 'ἤ', 'ἢ']
        } else 0), text, 0)
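
#Illustration (not from the original) of why NFD normalization matters here:
#membership only matches when both sides use the same Unicode normal form.
_CONJ = {normalize('NFD', val) for val in ['καί', 'καὶ', 'ἀλλά', 'ἀλλὰ', 'ἤ', 'ἢ']}
assert normalize('NFD', 'καί') in _CONJ
assert sum(1 for w in ('καὶ', 'λόγος', 'ἤ') if normalize('NFD', w) in _CONJ) == 2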


#Using 'sentences' makes the input 'text' parameter become a list of sentences
@textual_feature(tokenize_type='sentences')
def mean_sentence_length(text):  #parameter must be the text of a file