def __init__(self, root, fileids='.*'): """ Corpus reader designed to work with National Corpus of Polish. See http://nkjp.pl/ for more details about NKJP. use example: import nltk import nkjp from nkjp import NKJPCorpusReader x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus x.header() x.raw() x.words() x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html x.sents() x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) """ if isinstance(fileids, string_types): XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml') else: XMLCorpusReader.__init__( self, root, [fileid + '/header.xml' for fileid in fileids] ) self._paths = self.get_paths()
import textstat
from nltk.corpus.reader import XMLCorpusReader


def feature_apply(feature_extractor, feature_vector, attribute, number_of_file):
    """
    Extract features from each document.

    :param feature_extractor: function that extracts features
    :param feature_vector: a list of features
    :param attribute: indicates whether features are extracted for gender or for age
    :param number_of_file: number of documents to process
    :return: vector containing the extracted features
    """
    corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/en'
    # corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/meTets'
    newcorpus = XMLCorpusReader(corpus_root, '.*')
    i = 0
    feature_set = []
    doc_list = newcorpus.fileids()
    print(len(doc_list))
    for doc in doc_list[:number_of_file]:
        i += 1
        if i % 50 == 0:  # progress report every 50 documents
            print(i)
        doc = newcorpus.xml(doc)
        number_of_conversation = int(doc[0].attrib["count"])
        # print(doc[0].attrib["count"])
        txt = " ".join(
            doc[0][j].text
            for j in range(number_of_conversation)
            if doc[0][j].text is not None
        )
        # print(txt)
        if textstat.sentence_count(txt) != 0:
            feature_set.append(
                (feature_extractor(txt, feature_vector), doc.attrib[attribute])
            )
    return feature_set
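# A minimal usage sketch for feature_apply, assuming a hypothetical
# bag-of-words extractor; extract_unigram_features and the example feature
# list below are illustrative, not part of the original code.
def extract_unigram_features(text, feature_vector):
    # Count how often each feature word occurs in the document.
    words = text.lower().split()
    return {feat: words.count(feat) for feat in feature_vector}

# Label the first 100 documents with their 'gender' attribute.
train_set = feature_apply(extract_unigram_features, ['love', 'game', 'work'], 'gender', 100)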
def __init__(self, *args, **kwargs):
    if 'textid_file' in kwargs:
        self._textids = kwargs['textid_file']
    else:
        self._textids = None
    XMLCorpusReader.__init__(self, *args)
    CategorizedCorpusReader.__init__(self, kwargs)
    self._init_textids()
def __init__(self, root, fileids, wrap_etree=False):
    XMLCorpusReader.__init__(self, root, fileids, wrap_etree)

    self._lemma_to_class = defaultdict(list)
    """A dictionary mapping from verb lemma strings to lists of
    verbnet class identifiers."""

    self._wordnet_to_class = defaultdict(list)
    """A dictionary mapping from wordnet identifier strings to lists
    of verbnet class identifiers."""

    self._class_to_fileid = {}
    """A dictionary mapping from class identifiers to corresponding
    file identifiers.  The keys of this dictionary provide a complete
    list of all classes and subclasses."""

    self._shortid_to_longid = {}

    # Initialize the dictionaries.  Use the quick (regexp-based)
    # method instead of the slow (xml-based) method, because it
    # runs 2-30 times faster.
    self._quick_index()
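# Usage sketch: the indexes built above back lemma-to-class lookups such as
# NLTK's bundled verbnet reader; the exact class ids returned depend on the
# installed VerbNet version.
from nltk.corpus import verbnet
print(verbnet.classids(lemma='run'))  # e.g. ['run-51.3.2']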
import textstat
from nltk.corpus.reader import XMLCorpusReader


def test_set(corpus_dir, feature_extractor, vect_path, i):
    """
    Read and process the test set, extracting features for each document.

    :param corpus_dir: path of the test set
    :param feature_extractor: function that extracts features
    :param vect_path: path used by create_feature_vect to build the feature vector
    :param i: index of the class in the true_pred dictionary values;
        0 refers to gender, otherwise it refers to age
    :return: vector containing the extracted features
    """
    vect = create_feature_vect(vect_path)
    newcorpus = XMLCorpusReader(corpus_dir, '.*')
    doc_list = newcorpus.fileids()
    test_feature_set = []
    true_pred = extract_true_pred(corpus_dir[:-2] + "truth-en.txt")
    for doc in doc_list:
        xml_name = doc
        doc = newcorpus.xml(doc)
        print(doc[0].attrib["count"])
        txt = fetch_text(doc)
        if (textstat.sentence_count(txt) != 0) and (txt != ""):
            test_feature_set.append(
                (feature_extractor(txt, vect), true_pred[xml_name][i])
            )
    return test_feature_set
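# A usage sketch for test_set; the paths are hypothetical and mirror the
# PAN-2013 layout used above, and index 0 selects the gender label. Note the
# corpus_dir must end in 'en' so corpus_dir[:-2] points at the directory
# containing truth-en.txt.
test_features = test_set(
    '/root/Downloads/TextMining/pan13-test-corpus/en',
    extract_unigram_features,  # hypothetical extractor, defined earlier
    '/root/Downloads/TextMining/feature_vector.txt',
    0,
)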
def xml(self, fileid=None):
    return XMLCorpusReader.xml(self, fileid)
def __init__(self, root, fileids='.*'):
    XMLCorpusReader.__init__(self, root, fileids)
def __init__(self, root, fileids, lazy=True):
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
def __init__(self, root, fileids, wordnet, lazy=True):
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
    self._wordnet = wordnet
def __init__(self, root, fileids):
    XMLCorpusReader.__init__(self, root, fileids)
def xml(self, fileids=None, categories=None):
    fileids, _ = self._resolve(fileids, categories)
    if len(fileids) == 1:
        return XMLCorpusReader.xml(self, fileids[0])
    else:
        raise TypeError('Expected a single file')
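# Usage sketch: xml() only accepts a selection that resolves to exactly one
# file; the reader, file, and category names here are hypothetical.
elt = reader.xml(fileids='news/article01.xml')  # single file: returns its parsed XML
# reader.xml(categories='news')  # raises TypeError if the category spans several files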
def __init__(self, root, fileid):
    self.path = root + fileid
    XMLCorpusReader.__init__(self, root, fileid)
def __init__(self, root, fileid):
    XMLCorpusReader.__init__(self, root, fileid)
    self._fileid = self._fileids[0]
    self.elt = self.xml()
    self.data = _xml_to_dict(self.elt)
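# Usage sketch: this reader parses its single file eagerly, so the element
# tree and its dict form are available right after construction; the class
# name and file path are hypothetical.
reader = SingleDocXMLReader('/path/to/corpus', 'doc.xml')
print(reader.elt.tag)       # root element of the parsed XML
print(sorted(reader.data))  # keys produced by _xml_to_dict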
def raw(self, fileids=None):
    return XMLCorpusReader.raw(self, fileids)