def __init__(self, root, zipfile, fileids):
    if isinstance(root, basestring):
        root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')
    # convert to a ZipFilePathPointer
    root = ZipFilePathPointer(root.join(zipfile))
    CorpusReader.__init__(self, root, fileids)
    self._parse_char_replacements()
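A minimal sketch of the path composition this constructor performs, assuming a hypothetical corpus directory '/corpora/ycoe' that contains an archive 'psd.zip':

from nltk.data import FileSystemPathPointer, ZipFilePathPointer

# Both paths are hypothetical; each pointer raises IOError at
# construction time if the underlying path does not exist.
fs_root = FileSystemPathPointer('/corpora/ycoe')
zip_root = ZipFilePathPointer(fs_root.join('psd.zip'))
print(zip_root)  # the reader's root now points inside the zip archive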
def __init__(self, root, fileids=DOC_PATTERN,
             word_tokenizer=TweetTokenizer(), encoding='utf8', **kwargs):
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN
    CategorizedCorpusReader.__init__(self, kwargs)
    TwitterCorpusReader.__init__(self, root, fileids, encoding)
    if isinstance(root, string_types) and not isinstance(root, PathPointer):
        m = re.match(r'(.*\.gz)/?(.*\.zip)/?(.*)$|', root)
        # alternative: r'(.*\.zip)/?(.*\.gz)/?(.*)$|'
        gzipfile, zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        elif gzipfile:
            root = ZipFilePathPointer(gzipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')
    self._root = root
    self.current_doc = []
def __init__(self, params, corpus, with_doc=False):
    super().__init__(params, corpus)
    logging.info('Initialize PropBank reader.')
    if with_doc:
        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')
        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))
    self.propbank = PropbankCorpusReader(
        root=FileSystemPathPointer(params.root),
        propfile=params.propfile,
        framefiles=params.frame_files,
        verbsfile=params.verbs_file,
    )
    self.propbank_annos = defaultdict(list)
    logging.info("Loading PropBank Data.")
    for inst in self.propbank.instances():
        docid = inst.fileid.split('/')[-1]
        self.propbank_annos[docid].append(inst)
    self.stats = {
        'predicate_count': 0,
        'argument_count': 0,
    }
def ctb_clear():
    ctb_dir = path.join(home_dir, 'normal_ctb_test')
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    # reg = '.*dev'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    for fid in fileids:
        f1 = open('normal_ctb_test/' + fid, mode='r')
        f2 = open('for_clearnlp/' + fid, mode='w')
        # Drop the <S>...</S> sentence markers; copy every other line.
        for line in f1.readlines():
            if line.find('<S>') >= 0 or line.find('</S>') >= 0:
                continue
            f2.write(line)
        f1.close()
        f2.close()
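The find_corpus_fileids call can be exercised on its own; a sketch assuming a local directory 'normal_ctb_test' holding files such as 'chtb_0001.nw':

from nltk.data import FileSystemPathPointer
from nltk.corpus.reader.util import find_corpus_fileids

# The regexp is matched against each path relative to the root.
root = FileSystemPathPointer('normal_ctb_test')  # assumed to exist
print(find_corpus_fileids(root=root, regexp=r'.*\.nw'))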
def static_dp():
    ctb_dir = path.join(home_dir, 'for_clearnlp')
    # reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    reg = '(.*dep)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    # Count blank lines across all matched files.
    ct = 0
    for fid in fileids:
        f2 = open('for_clearnlp/' + fid, mode='r')
        for line in f2.readlines():
            if line == '\n':
                ct += 1
        f2.close()
    print(ct)
def read_knbc(train_file, test_file, reference_file):
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')
    sentences = knbc.sents()
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
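LazyCorpusLoader defers building the KNBCorpusReader until the corpus is first touched; a sketch assuming the KNB corpus has been installed under nltk_data:

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.knbc import KNBCorpusReader

# Nothing is read from disk here; the reader is constructed lazily
# on first attribute access.
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                        r'.*', encoding='euc-jp')
print(knbc.sents()[0])  # first access triggers the real load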
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, string_types):
        fileids = [fileids]
    paths = [FileSystemPathPointer(self._root.join(f)) for f in fileids]
    if include_encoding and include_fileid:
        return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
    elif include_fileid:
        return list(zip(paths, fileids))
    elif include_encoding:
        return list(zip(paths, [self.encoding(f) for f in fileids]))
    else:
        return paths
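The three include_* combinations produce differently shaped lists; a sketch using the Gutenberg corpus, assuming it has been fetched with nltk.download('gutenberg'):

from nltk.corpus import gutenberg

fid = gutenberg.fileids()[0]
print(gutenberg.abspaths([fid]))                         # [path]
print(gutenberg.abspaths([fid], include_encoding=True))  # [(path, encoding)]
print(gutenberg.abspaths([fid], True, True))             # [(path, encoding, fileid)]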
def __init__(self, config_path):
    conf = load_file_config(config_path)
    logging.info(json.dumps(conf, indent=2))
    params = GCDataSet.GCConfig(config=conf)
    super().__init__(params)
    wsj_treebank = BracketParseCorpusReader(
        root=params.wsj_path,
        fileids=params.wsj_file_pattern,
        tagset='wsj',
        encoding='ascii')
    self.nombank = NombankCorpusReader(
        root=FileSystemPathPointer(params.nombank_path),
        nomfile=params.nomfile,
        framefiles=params.frame_file_pattern,
        nounsfile=params.nombank_nouns_file,
        # Strip the leading 'wsj/' so fileids match the treebank layout.
        parse_fileid_xform=lambda s: s[4:],
        parse_corpus=wsj_treebank)
def __init__(self, params, corpus, with_doc=False):
    super().__init__(params, corpus, with_doc)
    self.wsj_treebank = BracketParseCorpusReader(
        root=params.wsj_path,
        fileids=params.wsj_file_pattern,
        tagset="wsj",
        encoding="ascii",
    )
    logging.info("Found {} treebank files.".format(
        len(self.wsj_treebank.fileids())))
    self.nombank = NombankCorpusReader(
        root=FileSystemPathPointer(params.nombank_path),
        nomfile=params.nomfile,
        framefiles=params.frame_file_pattern,
        nounsfile=params.nombank_nouns_file,
        parse_fileid_xform=lambda s: s[4:],
        parse_corpus=self.wsj_treebank,
    )
    logging.info("Loading G&C annotations.")
    self.gc_annos = self.load_gc_annotations()
    num_gc_preds = sum(
        len(preds) for (d, preds) in self.gc_annos.items())
    logging.info(f"Loaded {num_gc_preds} predicates.")
    logging.info("Loading Nombank annotations.")
    self.nombank_annos = defaultdict(list)
    for nb_instance in self.nombank.instances():
        docid = nb_instance.fileid.split("/")[-1]
        self.nombank_annos[docid].append(nb_instance)
    self.stats = {
        "target_pred_count": Counter(),
        "predicates_with_implicit": Counter(),
        "implicit_slots": Counter(),
    }
    self.stat_dir = params.stat_dir
def __init__(self, root, fileids, encoding='utf8', tagset=None):
    """
    :type root: PathPointer or str
    :param root: A path pointer identifying the root directory for
        this corpus.  If a string is specified, then it will be
        converted to a ``PathPointer`` automatically.
    :param fileids: A list of the files that make up this corpus.
        This list can either be specified explicitly, as a list of
        strings; or implicitly, as a regular expression over file
        paths.  The absolute path for each file will be constructed
        by joining the reader's root to each file name.
    :param encoding: The default unicode encoding for the files
        that make up the corpus.  The value of ``encoding`` can be any
        of the following:

        - A string: ``encoding`` is the encoding name for all files.
        - A dictionary: ``encoding[file_id]`` is the encoding name for
          the file whose identifier is ``file_id``.  If ``file_id`` is
          not in ``encoding``, then the file contents will be processed
          using non-unicode byte strings.
        - A list: ``encoding`` should be a list of ``(regexp, encoding)``
          tuples.  The encoding for a file whose identifier is
          ``file_id`` will be the ``encoding`` value for the first tuple
          whose ``regexp`` matches the ``file_id``.  If no tuple's
          ``regexp`` matches the ``file_id``, the file contents will be
          processed using non-unicode byte strings.
        - None: the file contents of all files will be processed using
          non-unicode byte strings.
    :param tagset: The name of the tagset used by this corpus, to be
        used for normalizing or converting the POS tags returned by the
        ``tagged_...()`` methods.
    """
    # Convert the root to a path pointer, if necessary.
    if isinstance(root, compat.string_types) and not isinstance(root, PathPointer):
        m = re.match(r'(.*\.zip)/?(.*)$|', root)
        zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # If `fileids` is a regexp, then expand it.
    if isinstance(fileids, compat.string_types):
        fileids = find_corpus_fileids(root, fileids)

    self._fileids = fileids
    """A list of the relative paths for the fileids that make up
    this corpus."""

    self._root = root
    """The root directory for this corpus."""

    # If encoding was specified as a list of regexps, then convert
    # it to a dictionary.
    if isinstance(encoding, list):
        encoding_dict = {}
        for fileid in self._fileids:
            for x in encoding:
                (regexp, enc) = x
                if re.match(regexp, fileid):
                    encoding_dict[fileid] = enc
                    break
        encoding = encoding_dict

    self._encoding = encoding
    """The default unicode encoding for the fileids that make up
    this corpus.  If ``encoding`` is None, then the file contents
    are processed using byte strings."""
    self._tagset = tagset
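A standalone sketch of the encoding-list conversion performed above: the first tuple whose regexp matches a fileid wins, and unmatched fileids are simply left out of the dictionary (and are later read as byte strings). The fileids here are hypothetical:

import re

encoding = [(r'latin/.*', 'latin-1'), (r'utf/.*', 'utf8')]
fileids = ['latin/a.txt', 'utf/b.txt', 'other/c.txt']

# Mirrors the conversion loop in the constructor above.
encoding_dict = {}
for fileid in fileids:
    for regexp, enc in encoding:
        if re.match(regexp, fileid):
            encoding_dict[fileid] = enc
            break

print(encoding_dict)  # {'latin/a.txt': 'latin-1', 'utf/b.txt': 'utf8'}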
wsj_treebank = BracketParseCorpusReader(
    root=cfg.wsj_root,
    fileids=cfg.wsj_file_pattern,
    tagset='wsj',
    encoding='ascii'
)


def fileid_xform_function(fileid):
    # result = re.sub(r'^wsj/', '', fileid)
    # return result
    return fileid[4:]


propbank = PropbankCorpusReader(
    root=FileSystemPathPointer(cfg.propbank_root),
    propfile=cfg.propbank_file,
    framefiles=cfg.frame_file_pattern,
    verbsfile=cfg.propbank_verbs_file,
    parse_fileid_xform=fileid_xform_function,
    parse_corpus=wsj_treebank
)

nombank = NombankCorpusReader(
    root=FileSystemPathPointer(cfg.nombank_root),
    nomfile=cfg.nombank_file,
    framefiles=cfg.frame_file_pattern,
    nounsfile=cfg.nombank_nouns_file,
    parse_fileid_xform=fileid_xform_function,
    parse_corpus=wsj_treebank
)
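With the readers wired up this way, instances() yields annotations that carry their source fileid, predicate position, and argument list; a short sketch, assuming cfg points at valid PropBank and WSJ installs:

# Inspect the first few PropBank annotations.
for inst in propbank.instances()[:3]:
    print(inst.fileid, inst.sentnum, inst.wordnum, inst.roleset)
    for arg_pointer, arg_label in inst.arguments:
        print('   ', arg_label, arg_pointer)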
from os import path
import re

from nltk.data import FileSystemPathPointer
from nltk.corpus.reader.util import find_corpus_fileids

home_dir = path.join(path.dirname(__file__), './')

# ctb_dir = '/home/lnn/Downloads/ctb_test'
ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/out_paper'
# ctb_dir = '/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/normal_ctb_test'
# ctb_dir = '/home/nana/Documents/pycharmforlinux/upenn_transfer/normal_ctb_test_v1'
ctb_path = path.join(ctb_dir, 'ctb.secondtest.clean')
counts = 0
fc = open(ctb_path, mode='w', encoding='utf-8')

# reg = 'chtb_3095.bn'
reg = '(.*nw)*(.*bc)*(.*mz)*(.*bn)*(.*wb)*'
# reg = '(.*nw)*(.*mz)*'
ctb_dir = FileSystemPathPointer(ctb_dir)
fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
OTHER = [']', '[', ')', '(', '<', '/', '>']


def strQ2B(ustring):
    """Convert full-width characters to half-width."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:
            # A full-width space converts directly to an ASCII space.
            inside_code = 32
        elif 65281 <= inside_code <= 65374:
            # Other full-width characters (excluding space) shift by a
            # fixed offset to their half-width counterparts.
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring
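A quick check of strQ2B on a mixed full-width string; the expected output follows from the two offset rules above:

# 'ＡＢ' are full-width letters, '　' is the ideographic space (U+3000),
# and '１２３' are full-width digits.
assert strQ2B('ＡＢ　１２３') == 'AB 123'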
def test_nombank_raw_fileid(self):
    root = FileSystemPathPointer(self.FILE_PATH)
    r = NombankCorpusReader(root, 'test_corpus_reader.json')
    assert r.raw('test_corpus_reader.json') == '{"test":"json", "number":5}\n'
    # Is the path item a zipfile?
    if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
        try:
            return ZipFilePathPointer(path_, resource_name)
        except IOError:
            # resource not in zipfile
            continue

    # Is the path item a directory or is resource_name an absolute path?
    elif not path_ or os.path.isdir(path_):
        if zipfile is None:
            p = os.path.join(path_, url2pathname(resource_name))
            if os.path.exists(p):
                if p.endswith('.gz'):
                    return GzipFileSystemPathPointer(p)
                else:
                    return FileSystemPathPointer(p)
        else:
            p = os.path.join(path_, url2pathname(zipfile))
            if os.path.exists(p):
                try:
                    return ZipFilePathPointer(p, zipentry)
                except IOError:
                    # resource not in zipfile
                    continue

# Fallback: if the path doesn't include a zip file, then try
# again, assuming that one of the path components is inside a
# zipfile of the same name.
if zipfile is None:
    pieces = resource_name.split('/')
    for i in range(len(pieces)):
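The effect of this lookup order is observable through nltk.data.find: a resource still packed in its downloaded zip comes back as a ZipFilePathPointer, an unpacked one as a FileSystemPathPointer. A sketch assuming the treebank sample has been fetched with nltk.download('treebank'):

import nltk

# ZipFilePathPointer if only treebank.zip is on disk,
# FileSystemPathPointer if the corpus directory was unpacked.
ptr = nltk.data.find('corpora/treebank/combined')
print(type(ptr).__name__, ptr)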
def rules(normal_save_dir, mmbroken_dir, other_broken_dir, phrases_dir,
          value_error_dir):
    ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/all_data'
    # ctb_dir = '/home/lnn/Downloads/ctb_bracket'
    # ctb_dir = home_dir
    # ctb_dir = path.join(home_dir, 'ctb_test')
    # reg = 'chtb_0040.nw'
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    statis = [0, 0, 0, 0]
    sum_broken_phrases = {}
    sum_mmbrokens = {}
    for fid in fileids:
        print(fid)
        (normal_trees, mmbrokens, mmbroken_trees, other_brokens,
         broken_phrases, value_error, mmtext) = analysis_v2(ctb_dir, fid)
        statis[0] += len(normal_trees)
        statis[1] += len(other_brokens)
        statis[2] += len(value_error)
        statis[3] += len(mmbroken_trees)
        # f = open('mmtext.txt', mode='a')
        # f.write('{}: \n'.format(fid))
        # for line in mmtext:
        #     f.write(' '.join(mm_out(line[0])) + '\n')
        #     f.write(' '.join(mm_out(line[1])) + '\n')
        #     f.write(' '.join(mm_out(line[2])) + '\n')
        #     f.write(' '.join(mm_out(line[3])) + '\n')
        #     f.write('\n')
        # f.write('\n\n')
        # f.close()
        for k, v in broken_phrases.items():
            sum_broken_phrases[k] = sum_broken_phrases.get(k, 0) + v
        for k, v in mmbrokens.items():
            sum_mmbrokens[k] = sum_mmbrokens.get(k, 0) + v
        if len(value_error) > 0:
            f = open(value_error_dir + '/' + fid, mode='w')
            for i in value_error:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(normal_trees) > 0:
            f = open(normal_save_dir + '/' + fid, mode='w')
            for i in normal_trees:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(mmbroken_trees) > 0:
            f = open(mmbroken_dir + '/' + fid, mode='w')
            for i in mmbroken_trees:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(other_brokens) > 0:
            f = open(other_broken_dir + '/' + fid, mode='w')
            for i in other_brokens:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
    if len(sum_broken_phrases) > 0:
        f = open(phrases_dir + '/broken_phrases.txt', mode='w')
        for k, v in sum_broken_phrases.items():
            f.write('{} {}\n'.format(k, v))
        f.close()
    if len(sum_mmbrokens) > 0:
        f = open(mmbroken_dir + '/mmbrokens.txt', mode='w')
        for k, v in sum_mmbrokens.items():
            f.write('{} {}\n'.format(k, v))
        f.close()
    print(statis)
nombank_nouns_file = 'nombank.1.0.words'
frame_file_pattern = r'frames/.*\.xml'


def fileid_xform_function(filename):
    # Strip the leading 'wsj/' so fileids match the treebank layout.
    result = re.sub(r'^wsj/', '', filename)
    # result = re.sub(r'^wsj/\d\d/', '', filename)
    # result = re.sub(r'\.mrg$', '', result)
    return result


treebank = BracketParseCorpusReader(root=treebank_root,
                                    fileids=treebank_file_pattern,
                                    tagset='wsj',
                                    encoding='ascii')

propbank = PropbankCorpusReader(root=FileSystemPathPointer(propbank_root),
                                propfile=propbank_file,
                                framefiles=frame_file_pattern,
                                verbsfile=propbank_verbs_file,
                                parse_fileid_xform=fileid_xform_function,
                                parse_corpus=treebank)

nombank = NombankCorpusReader(root=FileSystemPathPointer(nombank_root),
                              nomfile=nombank_file,
                              framefiles=frame_file_pattern,
                              nounsfile=nombank_nouns_file,
                              parse_fileid_xform=fileid_xform_function,
                              parse_corpus=treebank)
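Because parse_corpus is wired to the treebank reader, each NomBank instance can recover its full parse; a sketch assuming the paths above point at real NomBank and Penn Treebank installs:

inst = nombank.instances()[0]
print(inst.fileid, inst.sentnum, inst.baseform)
tree = inst.tree                    # parse tree fetched via parse_corpus
print(inst.predicate.select(tree))  # subtree covering the predicate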
def __init__(self, root, fileids, encoding=None, tag_mapping_function=None):
    """
    @type root: L{PathPointer} or C{str}
    @param root: A path pointer identifying the root directory for
        this corpus.  If a string is specified, then it will be
        converted to a L{PathPointer} automatically.
    @param fileids: A list of the files that make up this corpus.
        This list can either be specified explicitly, as a list of
        strings; or implicitly, as a regular expression over file
        paths.  The absolute path for each file will be constructed
        by joining the reader's root to each file name.
    @param encoding: The default unicode encoding for the files
        that make up the corpus.  C{encoding}'s value can be any
        of the following:
          - B{A string}: C{encoding} is the encoding name for all files.
          - B{A dictionary}: C{encoding[file_id]} is the encoding
            name for the file whose identifier is C{file_id}.  If
            C{file_id} is not in C{encoding}, then the file contents
            will be processed using non-unicode byte strings.
          - B{A list}: C{encoding} should be a list of C{(regexp,
            encoding)} tuples.  The encoding for a file whose
            identifier is C{file_id} will be the C{encoding} value
            for the first tuple whose C{regexp} matches the
            C{file_id}.  If no tuple's C{regexp} matches the
            C{file_id}, the file contents will be processed using
            non-unicode byte strings.
          - C{None}: the file contents of all files will be processed
            using non-unicode byte strings.
    @param tag_mapping_function: A function for normalizing or
        simplifying the POS tags returned by the tagged_words()
        or tagged_sents() methods.
    """
    # Convert the root to a path pointer, if necessary.
    if isinstance(root, basestring):
        m = re.match(r'(.*\.zip)/?(.*)$|', root)
        zipfile, zipentry = m.groups()
        if zipfile:
            root = ZipFilePathPointer(zipfile, zipentry)
        else:
            root = FileSystemPathPointer(root)
    elif not isinstance(root, PathPointer):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # If `fileids` is a regexp, then expand it.
    if isinstance(fileids, basestring):
        fileids = find_corpus_fileids(root, fileids)

    self._fileids = fileids
    """A list of the relative paths for the fileids that make up
    this corpus."""

    self._root = root
    """The root directory for this corpus."""

    # If encoding was specified as a list of regexps, then convert
    # it to a dictionary.
    if isinstance(encoding, list):
        encoding_dict = {}
        for fileid in self._fileids:
            for x in encoding:
                (regexp, enc) = x
                if re.match(regexp, fileid):
                    encoding_dict[fileid] = enc
                    break
        encoding = encoding_dict

    self._encoding = encoding
    """The default unicode encoding for the fileids that make up
    this corpus.  If C{encoding} is C{None}, then the file contents
    are processed using byte strings (C{str})."""
    self._tag_mapping_function = tag_mapping_function