Exemplo n.º 1
0
 def __init__(self, root, zipfile, fileids):
     """Open a corpus stored inside a zip archive.

     ``root`` may be a string path or a ``PathPointer``; it is joined
     with ``zipfile``, wrapped in a ``ZipFilePathPointer``, and handed
     to ``CorpusReader.__init__``.
     """
     # Normalise the root argument to a PathPointer first.
     if isinstance(root, basestring):
         root = FileSystemPathPointer(root)
     elif not isinstance(root, PathPointer):
         raise TypeError('CorpusReader: expected a string or a PathPointer')

     # Point the reader at the zip archive located under the root.
     zipped_root = ZipFilePathPointer(root.join(zipfile))

     CorpusReader.__init__(self, zipped_root, fileids)

     self._parse_char_replacements()
Exemplo n.º 2
0
    def __init__(self,
                 root,
                 fileids=DOC_PATTERN,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8',
                 **kwargs):
        """Initialise a categorized twitter corpus reader.

        If no ``cat_*`` keyword is supplied, the default category
        pattern is used.  String roots are converted to the matching
        ``PathPointer`` type (zip, gzip, or plain filesystem).
        """
        # Fall back to the default category pattern unless the caller
        # provided any categorization keyword.
        if not any(k.startswith('cat_') for k in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        TwitterCorpusReader.__init__(self, root, fileids, encoding)

        # Convert a plain string root into the appropriate PathPointer.
        if isinstance(root, string_types) and not isinstance(root, PathPointer):
            match = re.match('(.*\.gz)/?(.*\.zip)/?(.*)$|', root)
            gzipfile, zipfile, zipentry = match.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            elif gzipfile:
                root = ZipFilePathPointer(gzipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        self._root = root
        self.current_doc = []
Exemplo n.º 3
0
    def __init__(self, params, corpus, with_doc=False):
        """Initialise the PropBank reader.

        :param params: configuration object; this method reads
            ``wsj_path``, ``wsj_file_pattern``, ``root``, ``propfile``,
            ``frame_files`` and ``verbs_file`` from it.
        :param corpus: corpus object forwarded to the superclass.
        :param with_doc: when True, also open the WSJ treebank so that
            parse trees are available alongside the PropBank annotations.
        """
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')

        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')

            logging.info('Found {} treebank files.'.format(
                len(self.wsj_treebank.fileids())))

        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )

        # Group PropBank instances by the file (document) they annotate;
        # the docid is the last component of the instance's fileid path.
        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for inst in self.propbank.instances():
            docid = inst.fileid.split('/')[-1]
            self.propbank_annos[docid].append(inst)

        # Running totals, updated by downstream processing.
        self.stats = {
            'predicate_count': 0,
            'argument_count': 0,
        }
Exemplo n.º 4
0
def ctb_clear():
    """Copy CTB test files into ``for_clearnlp``, stripping the
    ``<S>`` / ``</S>`` sentence-boundary markers.

    Reads every fileid under ``normal_ctb_test`` matching the genre
    regexp and writes a filtered copy under ``for_clearnlp``.
    """
    ctb_dir = FileSystemPathPointer(path.join(home_dir, 'normal_ctb_test'))
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    for fid in fileids:
        # Context managers guarantee both handles are closed even if a
        # read or write fails part way through.
        with open('normal_ctb_test/' + fid, mode='r') as src, \
                open('for_clearnlp/' + fid, mode='w') as dst:
            # Iterate lazily instead of readlines() to avoid loading
            # the whole file into memory.
            for line in src:
                if line.find('<S>') >= 0 or line.find('</S>') >= 0:
                    continue
                dst.write(line)
Exemplo n.º 5
0
def static_dp():
    """Count blank lines (sentence separators) across all dependency
    files under ``for_clearnlp`` and print the total."""
    ctb_dir = FileSystemPathPointer(path.join(home_dir, 'for_clearnlp'))
    reg = '(.*dep)*'
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    ct = 0
    for fid in fileids:
        # `with` closes the handle even on error; the generator avoids
        # materialising the whole file via readlines().
        with open('for_clearnlp/' + fid, mode='r') as f:
            ct += sum(1 for line in f if line == '\n')
    print(ct)
Exemplo n.º 6
0
def read_knbc(train_file, test_file, reference_file):
    """Load KNB corpus sentences and write train/test/reference splits."""
    root = nltk.data.find('corpora/knbc/corpus1')

    # Keep only fileids that look like numbered KNB documents.
    fileids = [
        fid for fid in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid)
    ]

    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )

    sentences = knbc.sents()

    # First 4000 sentences for training; the remainder (minus the last
    # sentence) is shared by the test and reference outputs.
    write_train(sentences[:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Exemplo n.º 7
0
    def abspaths(self,
                 fileids=None,
                 include_encoding=False,
                 include_fileid=False):
        """Return absolute paths for ``fileids`` (default: all files).

        Depending on the flags, each entry is a bare path, a
        ``(path, encoding)`` pair, a ``(path, fileid)`` pair, or a
        ``(path, encoding, fileid)`` triple.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            # A single fileid was passed; normalise to a list.
            fileids = [fileids]

        paths = [FileSystemPathPointer(self._root.join(fid))
                 for fid in fileids]

        if not (include_encoding or include_fileid):
            return paths

        if include_encoding:
            encodings = [self.encoding(fid) for fid in fileids]

        if include_encoding and include_fileid:
            return list(zip(paths, encodings, fileids))
        if include_fileid:
            return list(zip(paths, fileids))
        return list(zip(paths, encodings))
Exemplo n.º 8
0
    def __init__(self, config_path):
        """Build the dataset from the JSON config file at ``config_path``."""
        conf = load_file_config(config_path)
        # Echo the effective configuration for reproducibility.
        logging.info(json.dumps(conf, indent=2))

        params = GCDataSet.GCConfig(config=conf)
        super().__init__(params)

        # The treebank supplies the gold parses that NomBank points into.
        wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            # Drops the first 4 chars of each fileid — presumably a
            # leading 'wsj/' prefix so ids match the treebank's.
            # TODO(review): confirm the prefix is always 4 characters.
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=wsj_treebank)
Exemplo n.º 9
0
    def __init__(self, params, corpus, with_doc=False):
        """Initialise the G&C reader on top of NomBank and the WSJ treebank.

        :param params: configuration; this method reads ``wsj_path``,
            ``wsj_file_pattern``, ``nombank_path``, ``nomfile``,
            ``frame_file_pattern``, ``nombank_nouns_file`` and
            ``stat_dir``.
        :param corpus: corpus object forwarded to the superclass.
        :param with_doc: forwarded to the superclass.
        """
        super().__init__(params, corpus, with_doc)

        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset="wsj",
            encoding="ascii",
        )

        logging.info("Found {} treebank files.".format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            # Drops the first 4 chars of each fileid — presumably a
            # leading 'wsj/' prefix so ids match the treebank's.
            # TODO(review): confirm the prefix is always 4 characters.
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank,
        )

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            [len(preds) for (d, preds) in self.gc_annos.items()])
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        # Group NomBank instances by the document (last path component
        # of the instance's fileid) they annotate.
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split("/")[-1]
            self.nombank_annos[docid].append(nb_instance)

        # Counters populated while processing documents.
        self.stats = {
            "target_pred_count": Counter(),
            "predicates_with_implicit": Counter(),
            "implicit_slots": Counter(),
        }

        self.stat_dir = params.stat_dir
Exemplo n.º 10
0
    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:
            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
              for normalizing or converting the POS tags returned by the
              tagged_...() methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, compat.string_types) and not isinstance(root, PathPointer):
            # The trailing '|' makes the pattern match any string: when
            # root contains no '.zip' component, the empty alternative
            # matches and both groups are None.
            m = re.match('(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, compat.string_types):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                # The first (regexp, encoding) pair that matches wins.
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
           this corpus.  If ``encoding`` is None, then the file
           contents are processed using byte strings."""
        self._tagset = tagset
Exemplo n.º 11
0
# Gold-standard WSJ parse trees, read with the 'wsj' tagset.
wsj_treebank = BracketParseCorpusReader(
    root=cfg.wsj_root,
    fileids=cfg.wsj_file_pattern,
    tagset='wsj',
    encoding='ascii'
)


def fileid_xform_function(fileid):
    """Drop the first four characters of *fileid* — presumably a
    leading 'wsj/' prefix — so it matches the treebank's fileids."""
    return fileid[4:]


# PropBank verb predicate-argument annotations, linked to the treebank
# parses via the fileid transform above.
propbank = PropbankCorpusReader(
    root=FileSystemPathPointer(cfg.propbank_root),
    propfile=cfg.propbank_file,
    framefiles=cfg.frame_file_pattern,
    verbsfile=cfg.propbank_verbs_file,
    parse_fileid_xform=fileid_xform_function,
    parse_corpus=wsj_treebank
)

# NomBank noun predicate-argument annotations, linked the same way.
nombank = NombankCorpusReader(
    root=FileSystemPathPointer(cfg.nombank_root),
    nomfile=cfg.nombank_file,
    framefiles=cfg.frame_file_pattern,
    nounsfile=cfg.nombank_nouns_file,
    parse_fileid_xform=fileid_xform_function,
    parse_corpus=wsj_treebank
)
Exemplo n.º 12
0
from nltk.data import FileSystemPathPointer

# Base directory of this script; relative paths below resolve against it.
home_dir = path.join(path.dirname(__file__), './')
import re
# ctb_dir = '/home/lnn/Downloads/ctb_test'
ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/out_paper'
# ctb_dir = '/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/normal_ctb_test'
# ctb_dir = '/home/nana/Documents/pycharmforlinux/upenn_transfer/normal_ctb_test_v1'

ctb_path = path.join(ctb_dir, 'ctb.secondtest.clean')
counts = 0
# NOTE(review): this handle is opened at module level and not closed in
# this chunk — confirm a later part of the script closes it.
fc = open(ctb_path, mode='w', encoding='utf-8')
# reg = 'chtb_3095.bn'
# Genre regexp: matches newswire/broadcast/magazine/web fileids.
reg = '(.*nw)*(.*bc)*(.*mz)*(.*bn)*(.*wb)*'
# reg = '(.*nw)*(.*mz)*'
ctb_dir = FileSystemPathPointer(ctb_dir)
fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)

# Bracket/markup characters treated specially by the cleaning code.
OTHER = [']', '[', ')', '(', '<', '/', '>']


def strQ2B(ustring):
    """全角转半角"""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  #全角空格直接转换
            inside_code = 32
        elif (inside_code >= 65281 and inside_code <= 65374):  #全角字符(除空格)根据关系转化
            inside_code -= 65248
Exemplo n.º 13
0
 def test_nombank_raw_fileid(self):
     """Reading a raw file through NombankCorpusReader yields its text."""
     corpus_root = FileSystemPathPointer(self.FILE_PATH)
     reader = NombankCorpusReader(corpus_root, 'test_corpus_reader.json')
     raw_text = reader.raw('test_corpus_reader.json')
     assert raw_text == '{"test":"json", "number":5}\n'
Exemplo n.º 14
0
        if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
            try:
                return ZipFilePathPointer(path_, resource_name)
            except IOError:
                # resource not in zipfile
                continue

        # Is the path item a directory or is resource_name an absolute path?
        elif not path_ or os.path.isdir(path_):
            if zipfile is None:
                p = os.path.join(path_, url2pathname(resource_name))
                if os.path.exists(p):
                    if p.endswith('.gz'):
                        return GzipFileSystemPathPointer(p)
                    else:
                        return FileSystemPathPointer(p)
            else:
                p = os.path.join(path_, url2pathname(zipfile))
                if os.path.exists(p):
                    try:
                        return ZipFilePathPointer(p, zipentry)
                    except IOError:
                        # resource not in zipfile
                        continue

    # Fallback: if the path doesn't include a zip file, then try
    # again, assuming that one of the path components is inside a
    # zipfile of the same name.
    if zipfile is None:
        pieces = resource_name.split('/')
        for i in range(len(pieces)):
Exemplo n.º 15
0
def rules(normal_save_dir, mmbroken_dir, other_broken_dir, phrases_dir,
          value_error_dir):
    """Run ``analysis_v2`` over the CTB corpus and sort its trees into
    per-category output directories.

    For each fileid, writes normal / mm-broken / other-broken /
    value-error trees (wrapped in ``<S>``/``</S>`` markers) into the
    corresponding directory, accumulates broken-phrase and mm-broken
    counts across all files, and finally writes the count summaries and
    prints overall statistics.

    :param normal_save_dir: directory for well-formed trees.
    :param mmbroken_dir: directory for mm-broken trees and the
        ``mmbrokens.txt`` summary.
    :param other_broken_dir: directory for other broken trees.
    :param phrases_dir: directory for the ``broken_phrases.txt`` summary.
    :param value_error_dir: directory for trees that raised value errors.
    """

    def _write_trees(out_dir, fid, trees):
        # Write each tree wrapped in <S>...</S>, one output file per fid;
        # skip entirely when there is nothing to write.
        if not trees:
            return
        with open(out_dir + '/' + fid, mode='w') as f:
            for tree in trees:
                f.write('<S>\n')
                f.write('( {})\n'.format(tree.__str__()))
                f.write('</S>\n')

    def _merge_counts(total, counts):
        # Accumulate per-file counts into the running totals.
        for k, v in counts.items():
            if total.get(k, 0) == 0:
                total[k] = v
            else:
                total[k] = total[k] + v

    def _write_summary(filename, totals):
        # Dump "<key> <count>" lines; skip when there is nothing to report.
        if not totals:
            return
        with open(filename, mode='w') as f:
            for k, v in totals.items():
                f.write('{} {}\n'.format(k, v))

    ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/all_data'
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)

    # [normal, other-broken, value-error, mm-broken] tree counts.
    statis = [0, 0, 0, 0]
    sum_broken_phrases = {}
    sum_mmbrokens = {}
    for fid in fileids:
        print(fid)
        (normal_trees, mmbrokens, mmbroken_trees, other_brokens,
         broken_phrases, value_error, mmtext) = analysis_v2(ctb_dir, fid)
        statis[0] += len(normal_trees)
        statis[1] += len(other_brokens)
        statis[2] += len(value_error)
        statis[3] += len(mmbroken_trees)

        _merge_counts(sum_broken_phrases, broken_phrases)
        _merge_counts(sum_mmbrokens, mmbrokens)

        _write_trees(value_error_dir, fid, value_error)
        _write_trees(normal_save_dir, fid, normal_trees)
        _write_trees(mmbroken_dir, fid, mmbroken_trees)
        _write_trees(other_broken_dir, fid, other_brokens)

    _write_summary(phrases_dir + '/broken_phrases.txt', sum_broken_phrases)
    _write_summary(mmbroken_dir + '/mmbrokens.txt', sum_mmbrokens)

    print(statis)
Exemplo n.º 16
0
# File listing the NomBank noun lemmas.
nombank_nouns_file = 'nombank.1.0.words'

# Regexp matching the per-predicate frame files.  NOTE(review): '\.' in
# a non-raw string still yields a literal backslash-dot, but a raw
# string would be clearer.
frame_file_pattern = 'frames/.*\.xml'


def fileid_xform_function(filename):
    """Strip a leading 'wsj/' prefix, if present, from *filename* so
    the id matches the treebank's fileids."""
    return re.sub(r'^wsj/', '', filename)


# Gold-standard WSJ parse trees, read with the 'wsj' tagset.
treebank = BracketParseCorpusReader(root=treebank_root,
                                    fileids=treebank_file_pattern,
                                    tagset='wsj',
                                    encoding='ascii')

# PropBank verb predicate-argument annotations, linked to the treebank
# parses via the fileid transform above.
propbank = PropbankCorpusReader(root=FileSystemPathPointer(propbank_root),
                                propfile=propbank_file,
                                framefiles=frame_file_pattern,
                                verbsfile=propbank_verbs_file,
                                parse_fileid_xform=fileid_xform_function,
                                parse_corpus=treebank)

# NomBank noun predicate-argument annotations, linked the same way.
nombank = NombankCorpusReader(root=FileSystemPathPointer(nombank_root),
                              nomfile=nombank_file,
                              framefiles=frame_file_pattern,
                              nounsfile=nombank_nouns_file,
                              parse_fileid_xform=fileid_xform_function,
                              parse_corpus=treebank)
Exemplo n.º 17
0
    def __init__(self,
                 root,
                 fileids,
                 encoding=None,
                 tag_mapping_function=None):
        """
        @type root: L{PathPointer} or C{str}
        @param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a L{PathPointer} automatically.
        @param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        @param encoding: The default unicode encoding for the files
            that make up the corpus.  C{encoding}'s value can be any
            of the following:
            
              - B{A string}: C{encoding} is the encoding name for all
                files.
              - B{A dictionary}: C{encoding[file_id]} is the encoding
                name for the file whose identifier is C{file_id}.  If
                C{file_id} is not in C{encoding}, then the file
                contents will be processed using non-unicode byte
                strings.
              - B{A list}: C{encoding} should be a list of C{(regexp,
                encoding)} tuples.  The encoding for a file whose
                identifier is C{file_id} will be the C{encoding} value
                for the first tuple whose C{regexp} matches the
                C{file_id}.  If no tuple's C{regexp} matches the
                C{file_id}, the file contents will be processed using
                non-unicode byte strings.
              - C{None}: the file contents of all files will be
                processed using non-unicode byte strings.
        @param tag_mapping_function: A function for normalizing or
                simplifying the POS tags returned by the tagged_words()
                or tagged_sents() methods.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, basestring):
            # The trailing '|' makes the pattern match any string: when
            # root contains no '.zip' component, the empty alternative
            # matches and both groups are None.
            m = re.match('(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, basestring):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                # The first (regexp, encoding) pair that matches wins.
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
           this corpus.  If C{encoding} is C{None}, then the file
           contents are processed using byte strings (C{str})."""
        self._tag_mapping_function = tag_mapping_function