def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, **kwargs):
    """Initialize the reader around a spaCy pipeline.

    @param nlp: spaCy Language model used to process the texts
    @param support_overlap: whether need to support overlapped annotations
    @param log_level: logging level configuration
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
        default is 0: only use file name
        1: use 1 level parent directory name + file name
        -1: use full absolution path
        if you are dealing with multiple directories, this is helpful to
        locate the original files
    @param kwargs: other parameters, stored as instance attributes
    @raise NameError: if nlp is not provided (exception type kept for
        backward compatibility with existing callers)
    """
    # Validate the required argument first, before any other side effects
    # (the original applied kwargs via setattr before this check).
    if nlp is None:
        raise NameError('parameter "nlp" need to be defined')
    # Expose any extra configuration options as instance attributes.
    for param_name, value in kwargs.items():
        setattr(self, param_name, value)
    self.nlp = nlp
    self.encoding = encoding
    self.doc_name_depth = doc_name_depth
    self.support_overlap = support_overlap
    self.set_logger(log_level)
    # Register doc_name once; another reader instance may have done it already.
    if not Doc.has_extension('doc_name'):
        Doc.set_extension('doc_name', default='')
def __init__(self):
    """Set up the Doc and Token extensions this component relies on."""
    super().__init__()
    # Doc-level container for this component's results, keyed by its name.
    doc_attr = self.name
    if not Doc.has_extension(doc_attr):
        Doc.set_extension(doc_attr, default=[])
    # Token-level flag consumed downstream.
    token_attr = 'is_lexical'
    if not Token.has_extension(token_attr):
        Token.set_extension(token_attr, default=False)
def __init__(self, paths=None):
    """Initialize corpus file locations, corpus totals, and Doc extensions.

    paths:list -> a list of string, each of which represents a path to one
        of the corpora needed, in the order: NGSL, NAWL, BSL, TSL,
        COCA Academic, COCA Technical, COCA General. When None, the bundled
        files under Corpora/ next to this module are used.

    This method initializes constants across the object to be used by other
    methods of this object.
    """
    super().__init__()
    # Register each Doc extension with its own guard: the original only
    # guarded self.name, so re-instantiating this component (or any other
    # component registering one of these names) raised a spaCy error.
    # NOTE(review): default=[] is shared across Docs unless the component
    # assigns a fresh list per Doc — presumably it does; verify downstream.
    for ext in (self.name, 'ngsl_words', 'nawl_words', 'tsl_words',
                'fpc_words', 'cocaacad_words', 'cocatech_words',
                'cocagenband1_words', 'cocagenband2_words',
                'cocagenband3_words'):
        if not Doc.has_extension(ext):
            Doc.set_extension(ext, default=[])
    if paths is None:
        # file locations: default corpora shipped alongside this module
        corpora_dir = os.path.dirname(__file__)
        self.fnameNGSL = os.path.join(
            corpora_dir, 'Corpora/NGSL+1.01+by+band - Frequency.csv')
        self.fnameNAWL = os.path.join(corpora_dir, 'Corpora/NAWL_SFI.csv')
        self.fnameBSL = os.path.join(
            corpora_dir, 'Corpora/BSL_1.01_SFI_freq_bands.csv')
        self.fnameTSL = os.path.join(
            corpora_dir, 'Corpora/TSL+1.1+Ranked+by+Frequency - TSL.csv')
        self.fnameCOCAAcad = os.path.join(corpora_dir, 'Corpora/COCA Academic.csv')
        self.fnameCOCATech = os.path.join(corpora_dir, 'Corpora/COCA Technical.csv')
        self.fnameCOCAGen = os.path.join(corpora_dir, 'Corpora/COCA General.csv')
    else:
        # file locations passed as a parameter to the constructor
        # (expects exactly 7 entries, in the order documented above)
        (self.fnameNGSL, self.fnameNAWL, self.fnameBSL, self.fnameTSL,
         self.fnameCOCAAcad, self.fnameCOCATech, self.fnameCOCAGen) = paths[:7]
    ## Corpus token totals. Taken by Vishal's code.
    self.NGSLTotal = 273613534
    self.NAWLTotal = 288176225
    self.TSLTotal = 1560194
    self.BSLTotal = 64651722
    self.COCAAcadTotal = 120032441
    # read the corpora
    self.read_corpora()
    self.nlp = spacy.load("en_core_web_sm")
def __init__(self, clf, extension='score'):
    """Wrap a classifier and register the Doc extension it writes to.

    :type clf: Classifier, needs to have a predict(X) function
    :param extension: name of the Doc extension holding the score (default -1)
    """
    self.clf = clf
    self.extension = extension
    # Already registered (e.g. by a previous instance)? Nothing to do.
    if Doc.has_extension(extension):
        return
    Doc.set_extension(extension, default=-1)
def __init__(self):
    """Register the TAALED Doc extensions and load its word-list files."""
    for ext in ('taaled_lemmas', 'context_tokens', 'function_tokens'):
        if not Doc.has_extension(ext):
            Doc.set_extension(ext, default=[])
    # Load TAALED word list files
    # source: https://github.com/kristopherkyle/TAALED/tree/master/TAALED_1_3_1_Py3/dep_files
    module_path = os.path.abspath(os.path.dirname(__file__))
    adj_lem_list_path = os.path.join(module_path, "Corpora/adj_lem_list.txt")
    real_words_path = os.path.join(module_path, "Corpora/real_words.txt")
    # Use context managers so the handles are closed (the original
    # open(...).read() leaked them). [:-1] drops the empty trailing entry
    # produced by the file's final newline.
    with open(adj_lem_list_path, "r", errors='ignore') as fh:
        self.adj_word_list = fh.read().split("\n")[:-1]
    with open(real_words_path, "r", errors='ignore') as fh:
        self.real_word_list = fh.read().split("\n")[:-1]
def test_docs_to_sents_df(self):
    """End-to-end check of Vectorizer.docs_to_sents_df over the eHOST test corpus."""
    # Start from a clean slate: a previous test may have registered 'concepts'.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    reader = EhostDirReader(nlp=self.nlp, support_overlap=False, recursive=True,
                            schema_file='data/ehost_test_corpus/config/projectschema.xml')
    docs = reader.read(txt_dir='data/ehost_test_corpus/')
    # With doc-name tracking enabled, one row per sentence is expected.
    sents_df = Vectorizer.docs_to_sents_df(docs, type_filter=set(), track_doc_name=True)
    print(sents_df)
    assert sents_df.shape[0] == 12
    sents_df = Vectorizer.docs_to_sents_df(docs, type_filter=set())
    print(sents_df)
    # A sentence window of 2 yields overlapping contexts, hence more rows.
    sents_df = Vectorizer.docs_to_sents_df(docs, sent_window=2)
    assert sents_df.shape[0] == 20
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    """Record extension names and register the Token/Span/Doc extensions.

    The first/last name extension names default to those of the
    corresponding list matchers but can be overridden by the caller.
    """
    # Copy the class-level extension names onto the instance.
    token_ext = self.TOKEN_EXTENSION_NAME
    span_ext = self.SPAN_EXTENSION_NAME
    doc_ext = self.DOC_EXTENSION_NAME
    self.token_extension_name = token_ext
    self.span_extension_name = span_ext
    self.doc_extension_name = doc_ext
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name
    # Register each extension at most once; other instances may already have.
    if not Token.has_extension(token_ext):
        Token.set_extension(token_ext, default=self.ANOT_NONE)
    if not Span.has_extension(span_ext):
        # Span-level value is computed on access rather than stored.
        Span.set_extension(span_ext, getter=self.is_full_name_getter)
    if not Doc.has_extension(doc_ext):
        Doc.set_extension(doc_ext, default=[])
def init_component(self):
    """Register the Doc extensions used for keyword extraction."""
    registered = Doc.has_extension
    if not registered("extract_keywords"):
        # Method extension: doc._.extract_keywords(...) delegates to self.
        Doc.set_extension("extract_keywords", method=self.extract_keywords)
    if not registered("kw_candidates"):
        Doc.set_extension("kw_candidates", default=None)
def __init__(self):
    """Register this component's Doc extensions (current and legacy names)."""
    for ext_name in (self.name, self.name + '_legacy'):
        if not Doc.has_extension(ext_name):
            Doc.set_extension(ext_name, default=[])
def __init__(self): if not Doc.has_extension("features"): Doc.set_extension("features", default=OrderedDict())