Example #1
def main(args):
    if args.log_file is not None:
        segmentation_logger.addHandler(file_handler(args.log_file))
    segmentation_logger.setLevel(args.log_level)

    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    segmenter = SEMModule(args.tokeniser_name, log_level=args.log_level)
    document = Document(os.path.basename(args.infile),
                        content=codecs.open(args.infile, "rU",
                                            ienc).read().replace(u"\r", u""))
    segmenter.process_document(document, log_level=args.log_level)
    tokens_spans = document.segmentation("tokens")
    sentence_spans = document.segmentation("sentences")
    joiner = (u"\n" if args.output_format == "vector" else u" ")
    content = document.content
    with codecs.open(args.outfile, "w", oenc) as O:
        for sentence in sentence_spans:
            sentence_token_spans = tokens_spans[sentence.lb:sentence.ub]
            sentence_tokens = [
                content[s.lb:s.ub] for s in sentence_token_spans
            ]
            O.write(joiner.join(sentence_tokens))
            if args.output_format == "vector":
                O.write(u"\n")
            O.write(u"\n")
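
The output logic above only differs by the joiner: spaces for one sentence per line, newlines for the "vector" (one token per line) layout. Below is a minimal, self-contained sketch of that slicing; the Span namedtuple and the sample spans are illustrative stand-ins for SEM's span objects.

from collections import namedtuple

Span = namedtuple("Span", "lb ub")  # illustrative stand-in for SEM's spans

content = "Hello world. Bye."
token_spans = [Span(0, 5), Span(6, 11), Span(11, 12), Span(13, 16), Span(16, 17)]
sentence_spans = [Span(0, 3), Span(3, 5)]  # indices into token_spans

for output_format in ("line", "vector"):
    joiner = "\n" if output_format == "vector" else " "
    for sentence in sentence_spans:
        tokens = [content[s.lb:s.ub]
                  for s in token_spans[sentence.lb:sentence.ub]]
        print(joiner.join(tokens) + ("\n" if output_format == "vector" else ""))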
Example #2
    def process_document(self, document, encoding="utf-8", **kwargs):
        """
        Annotate document with Wapiti.

        Parameters
        ----------
        document : sem.storage.Document
            the input data: the document to annotate
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            wapiti_label_logger.addHandler(file_handler(self._log_file))
        wapiti_label_logger.setLevel(self._log_level)

        if self._field in document.corpus.fields:
            wapiti_label_logger.warn(
                "field %s already exists in document, not annotating",
                self._field)

            tags = [[s[self._field] for s in sentence]
                    for sentence in document.corpus]
            document.add_annotation_from_tags(tags, self._field, self._field)
        else:
            wapiti_label_logger.info("annotating document with %s field",
                                     self._field)

            self._label_document(document, encoding)

        laps = time.time() - start
        wapiti_label_logger.info('in %s', timedelta(seconds=laps))
Example #3
def compile_dictionary(infile,
                       outfile,
                       kind="token",
                       ienc="UTF-8",
                       log_level=logging.CRITICAL,
                       log_file=None):
    if log_file is not None:
        compile_dictionary_logger.addHandler(file_handler(log_file))
    compile_dictionary_logger.setLevel(log_level)

    if kind not in _choices:
        raise RuntimeError("Invalid kind: {0}".format(kind))

    compile_dictionary_logger.info(
        u'compiling {0} dictionary from "{1}" to "{2}"'.format(
            kind, infile, outfile))

    try:
        dictionary_compile = _compile[kind]
    except KeyError:  # invalid kind asked
        compile_dictionary_logger.exception(
            "Invalid kind: {0}. Should be in: {1}".format(
                kind, u", ".join(_compile.keys())))
        raise

    with open(outfile, "wb") as output_stream:
        pickle.dump(dictionary_compile(infile, ienc), output_stream)

    compile_dictionary_logger.info(u"done")
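
The compiled dictionary is simply pickled to outfile, so reading it back is plain pickle. A minimal sketch of the loading side, with an illustrative file name; the object's type depends on the kind argument, so nothing is assumed about its structure.

import pickle

# "dictionary.pickle" is a placeholder for a file produced by compile_dictionary
with open("dictionary.pickle", "rb") as stream:
    dictionary = pickle.load(stream)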
Example #4
def main(args):
    """
    Cleans a CoNLL-formatted file, removing fields at given indices.
    
    Parameters
    ----------
    args.infile : str
        the name of the file to clean.
    args.outfile : str
        the name of the output file, where some columns have been removed.
    args.ranges : str
        the fields to keep, given as a comma-separated list of indices
        or of index ranges in Python slice format (e.g. "lo:hi").
    """

    if args.log_file is not None:
        clean_info_logger.addHandler(file_handler(args.log_file))
    clean_info_logger.setLevel(args.log_level)

    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc

    allowed = ranges_to_set(args.ranges,
                            len(
                                codecs.open(args.infile, "rU",
                                            ienc).readline().strip().split()),
                            include_zero=True)
    max_abs = 0
    for element in allowed:
        element = abs(element) + (1 if element > 0 else 0)
        max_abs = max(max_abs, element)
    nelts = len(
        codecs.open(args.infile, "rU", ienc).readline().strip().split())

    if nelts < max_abs:
        clean_info_logger.error(
            u'asked to keep up to {0} field(s), yet only {1} are present in "{2}"'
            .format(max_abs, nelts, args.infile))
        raise RuntimeError(
            u'asked to keep up to {0} field(s), yet only {1} are present in "{2}"'
            .format(max_abs, nelts, args.infile))

    clean_info_logger.info(u'cleaning "{0}"'.format(args.infile))
    clean_info_logger.info(u'keeping columns: {0}'.format(u", ".join(
        [str(s) for s in sorted(allowed)])))
    clean_info_logger.info(u'writing "{0}"'.format(args.outfile))

    with codecs.open(args.outfile, "w", oenc) as O:
        for line in codecs.open(args.infile, "rU", ienc):
            line = line.strip().split()
            if line != []:
                tokens = [line[i] for i in range(len(line)) if i in allowed]
                O.write(u"\t".join(tokens))
            O.write(u"\n")

    clean_info_logger.info(u'done')
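
The args.ranges value is a comma-separated mix of single indices and Python-style "lo:hi" ranges. The sketch below is a hypothetical parser (not SEM's ranges_to_set) that shows how such a string can be resolved against a known column count, including negative indices.

def parse_ranges(ranges, n_columns):
    """Turn e.g. "0,2:4,-1" into a set of column indices (illustrative only)."""
    allowed = set()
    for part in ranges.split(","):
        part = part.strip()
        if ":" in part:
            lo, hi = part.split(":")
            allowed.update(range(int(lo), int(hi)))
        else:
            index = int(part)
            allowed.add(index if index >= 0 else n_columns + index)
    return allowed

# keep columns 0, 2, 3 and the last one of a 6-column CoNLL file
print(sorted(parse_ranges("0,2:4,-1", 6)))  # [0, 2, 3, 5]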
Example #5
    def process_document(self, document, **kwargs):
        """
        Updates the CoNLL-formatted corpus inside a document with various
        features.
        
        Parameters
        ----------
        document : sem.storage.Document
            the input data; it contains an object representing CoNLL-formatted
            data, where each token is a dict that behaves like a TSV row.
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            enrich_logger.addHandler(file_handler(self._log_file))
        enrich_logger.setLevel(self._log_level)

        missing_fields = set([I.name for I in self.bentries + self.aentries
                              ]) - set(document.corpus.fields)

        if len(missing_fields) > 0:
            raise ValueError("Missing fields in input corpus: {0}".format(
                u",".join(sorted(missing_fields))))

        enrich_logger.info(u'enriching file "%s"', document.name)

        new_fields = [
            feature.name for feature in self.features if feature.display
        ]
        document.corpus.fields += new_fields
        nth = 0
        for p in document.corpus:
            for feature in self.features:
                if feature.is_sequence:
                    for i, value in enumerate(feature(p)):
                        p[i][feature.name] = value
                else:
                    for i in range(len(p)):
                        p[i][feature.name] = feature(p, i)
                        if feature.is_boolean:
                            p[i][feature.name] = int(p[i][feature.name])
                        elif p[i][feature.name] is None:
                            p[i][feature.name] = feature.default()
            nth += 1
            if nth % 1000 == 0:
                enrich_logger.debug(u'%i sentences enriched', nth)
        enrich_logger.debug(u'%i sentences enriched', nth)

        laps = time.time() - start
        enrich_logger.info(u"done in %s", timedelta(seconds=laps))
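
The loop above implies a small feature protocol: a feature is a callable with name, is_sequence and is_boolean attributes; sequence features are called on a whole sentence and yield one value per token, while the others are called as feature(sentence, i), with booleans cast to int (the default() fallback for None values is omitted here). The classes below are hypothetical, just to make that protocol concrete.

class UppercaseFeature(object):      # per-token, boolean feature
    name = "is_upper"
    is_sequence = False
    is_boolean = True

    def __call__(self, sentence, i):
        return sentence[i]["word"].isupper()


class PositionFeature(object):       # sequence feature: one value per token
    name = "position"
    is_sequence = True
    is_boolean = False

    def __call__(self, sentence):
        return list(range(len(sentence)))


sentence = [{"word": "EU"}, {"word": "rejects"}, {"word": "German"}]
for feature in (UppercaseFeature(), PositionFeature()):
    if feature.is_sequence:
        for i, value in enumerate(feature(sentence)):
            sentence[i][feature.name] = value
    else:
        for i in range(len(sentence)):
            value = feature(sentence, i)
            sentence[i][feature.name] = int(value) if feature.is_boolean else value
print(sentence)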
Example #6
    def process_document(self, document, **kwargs):
        """
        Updates a document with various segmentations and creates
        an sem.corpus (CoNLL-formatted data) using field argument as index.
        
        Parameters
        ----------
        document : sem.storage.Document
            the input data: a document that contains only raw content
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """
        
        start = time.time()

        if self._log_file is not None:
            segmentation_logger.addHandler(file_handler(self._log_file))
        segmentation_logger.setLevel(self._log_level)

        current_tokeniser = self._tokeniser

        segmentation_logger.debug(u'segmenting "%s" content', document.name)

        content = document.content
        if document.metadata("MIME") == "text/html":
            content = strip_html(content, keep_offsets=True)
        
        do_segmentation = (document.segmentation("tokens") is None
                           or document.segmentation("sentences") is None
                           or document.segmentation("paragraphs") is None)
        if do_segmentation:
            try:
                token_spans = current_tokeniser.word_spans(content)
            except NotImplementedError:
                token_spans = current_tokeniser.bounds2spans(
                    current_tokeniser.word_bounds(content))
            sentence_spans = current_tokeniser.bounds2spans(
                current_tokeniser.sentence_bounds(content, token_spans))
            paragraph_spans = current_tokeniser.bounds2spans(
                current_tokeniser.paragraph_bounds(content, sentence_spans,
                                                   token_spans))
        else:
            segmentation_logger.info(
                u'"{0}" already has a segmentation, not computing it'.format(
                    document.name))
            token_spans = document.segmentation("tokens").spans
            sentence_spans = document.segmentation("sentences").spans
            paragraph_spans = document.segmentation("paragraphs").spans
        segmentation_logger.info(
            u'"{0}" segmented in {1} sentences, {2} tokens'.format(
                document.name, len(sentence_spans), len(token_spans)))
        
        if document.segmentation("tokens") is None:
            document.add_segmentation(Segmentation("tokens", spans=token_spans))
        if document.segmentation("sentences") is None:
            document.add_segmentation(Segmentation(
                "sentences", reference=document.segmentation("tokens"),
                spans=sentence_spans))
        if document.segmentation("paragraphs") is None:
            document.add_segmentation(Segmentation(
                "paragraphs", reference=document.segmentation("sentences"),
                spans=paragraph_spans))
        if len(document.corpus) == 0:
            document.corpus.from_segmentation(
                document.content, document.segmentation("tokens"),
                document.segmentation("sentences"))

        laps = time.time() - start
        segmentation_logger.info(u'in {0}'.format(timedelta(seconds=laps)))
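
The tokeniser used above either returns spans directly (word_spans) or returns boundary offsets that bounds2spans turns into spans. The helper below is an illustrative approximation of that conversion, not SEM's actual bounds2spans: it pairs consecutive offsets and drops whitespace-only spans.

def bounds_to_spans(bounds, content):
    """Pair consecutive offsets into (lb, ub) spans, skipping whitespace (illustrative)."""
    return [(lb, ub)
            for lb, ub in zip(bounds, bounds[1:])
            if content[lb:ub].strip()]

content = "One two.\nThree."
word_bounds = [0, 3, 4, 7, 8, 9, 14, 15]
print(bounds_to_spans(word_bounds, content))
# [(0, 3), (4, 7), (7, 8), (9, 14), (14, 15)] -> "One", "two", ".", "Three", "."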
Example #7
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.
    
    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    outfile : str
        the CoNLL-formatted output file.
    mdl : str
        the wapiti model file.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line
        logging).
    """

    start = time.time()

    if args.log_file is not None:
        tagging_logger.addHandler(file_handler(args.log_file))
    tagging_logger.setLevel(args.log_level)

    infile = args.infile
    outfile = args.outfile
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    annotator = SEMModule(**vars(args))

    length = -1
    fields = None
    for sentence in Reader(infile, ienc):
        fields = fields or [unicode(i) for i in range(len(sentence[0]))]
        if length == -1:
            length = len(fields)
        if length != len(sentence[0]):
            raise ValueError(
                u"%s has inconsistent number of columns, found %i and %i" %
                (infile, length, len(sentence[0])))

    document = conll_file(infile, fields, fields[0], encoding=ienc)

    annotator.process_document(document)

    exporter = CoNLLExporter()

    exporter.document_to_file(document, None, outfile, encoding=oenc)

    laps = time.time() - start
    tagging_logger.info("done in %s", timedelta(seconds=laps))
Example #8
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.
    
    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    infofile : str
        the XML file containing the different features.
    mode : str
        the mode to use for infofile. Some inputs may only be present in
        a particular mode. For example, the output tag is only available
        in "train" mode.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line
        logging).
    """

    start = time.time()

    if args.log_file is not None:
        enrich_logger.addHandler(file_handler(args.log_file))
    enrich_logger.setLevel(args.log_level)
    enrich_logger.info('parsing enrichment file "%s"' % args.infofile)

    informations = Informations(path=args.infofile, mode=args.mode)

    enrich_logger.debug('enriching file "%s"' % args.infile)

    bentries = [entry.name for entry in informations.bentries]
    aentries = [entry.name for entry in informations.aentries]
    features = [
        feature.name for feature in informations.features if feature.display
    ]
    with KeyWriter(args.outfile, args.oenc or args.enc,
                   bentries + features + aentries) as O:
        nth = 0
        for p in informations.enrich(
                KeyReader(args.infile, args.ienc or args.enc,
                          bentries + aentries)):
            O.write_p(p)
            nth += 1
            if nth % 1000 == 0:
                enrich_logger.debug('%i sentences enriched' % nth)
        enrich_logger.debug('%i sentences enriched' % nth)

    laps = time.time() - start
    enrich_logger.info("done in %s", timedelta(seconds=laps))
Example #9
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.
    
    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    infofile : str
        the XML file containing the different features.
    mode : str
        the mode to use for infofile. Some inputs may only be present in
        a particular mode. For example, the output tag is only available
        in "train" mode.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line
        logging).
    """

    start = time.time()

    if args.log_file is not None:
        enrich_logger.addHandler(file_handler(args.log_file))
    enrich_logger.setLevel(args.log_level)
    enrich_logger.info(u'parsing enrichment file "%s"', args.infofile)

    processor = SEMModule(path=args.infofile, mode=args.mode)

    enrich_logger.debug(u'enriching file "%s"', args.infile)

    bentries = [entry.name for entry in processor.bentries]
    aentries = [entry.name for entry in processor.aentries]
    features = [
        feature.name for feature in processor.features if feature.display
    ]
    document = from_conll(args.infile,
                          bentries + aentries, (bentries + aentries)[0],
                          encoding=args.ienc or args.enc)

    processor.process_document(document)
    with KeyWriter(args.outfile, args.oenc or args.enc,
                   bentries + features + aentries) as O:
        for p in document.corpus:
            O.write_p(p)

    laps = time.time() - start
    enrich_logger.info(u"done in %s", timedelta(seconds=laps))
Example #10
    def process_document(self, document, **kwargs):
        """
        Updates the CoNLL-formatted corpus inside a document with various
        features.
        
        Parameters
        ----------
        document : sem.storage.Document
            the input data; it contains an object representing CoNLL-formatted
            data, where each token is a dict that behaves like a TSV row.
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            enrich_logger.addHandler(file_handler(self._log_file))
        enrich_logger.setLevel(self._log_level)

        informations = self._informations
        missing_fields = set([
            I.name for I in informations.bentries + informations.aentries
        ]) - set(document.corpus.fields)

        if len(missing_fields) > 0:
            raise ValueError("Missing fields in input corpus: %s" %
                             u",".join(sorted(missing_fields)))

        enrich_logger.debug('enriching file "%s"' % document.name)

        new_fields = [
            feature.name for feature in informations.features
            if feature.display
        ]
        document.corpus.fields += new_fields
        nth = 0
        for sentence in informations.enrich(document.corpus):
            nth += 1
            if nth % 1000 == 0:
                enrich_logger.debug('%i sentences enriched' % nth)
        enrich_logger.debug('%i sentences enriched' % nth)

        laps = time.time() - start
        enrich_logger.info("done in %s" % timedelta(seconds=laps))
Example #11
    def __init__(self,
                 annotator,
                 field,
                 log_level="WARNING",
                 log_file=None,
                 *args,
                 **kwargs):
        super(SEMModule, self).__init__(log_level=log_level,
                                        log_file=log_file,
                                        **kwargs)

        if self._log_file is not None:
            tagging_logger.addHandler(file_handler(self._log_file))
        tagging_logger.setLevel(self._log_level)

        self._annotator = sem.annotators.get_annotator(annotator)(field, *args,
                                                                  **kwargs)
Example #12
    def process_document(self,
                         document,
                         outfile=sys.stdout,
                         output_encoding="utf-8",
                         **kwargs):
        start = time.time()

        if self._log_file is not None:
            export_logger.addHandler(file_handler(self._log_file))
        export_logger.setLevel(self._log_level)

        export_logger.debug('setting name/column couples for exportation')

        oenc = kwargs.get("output-encoding", "utf-8")
        pos_column = self._pos_column
        chunk_column = self._chunk_column
        ner_column = self._ner_column

        couples = {}
        if "word" in document.corpus.fields:
            couples["token"] = "word"
        elif "token" in document.corpus.fields:
            couples["token"] = "token"

        if pos_column:
            couples["pos"] = pos_column
            export_logger.debug('POS column is %s' % pos_column)
        if chunk_column:
            couples["chunking"] = chunk_column
            export_logger.debug('chunking column is %s' % chunk_column)
        if ner_column:
            couples["ner"] = ner_column
            export_logger.debug('NER column is %s' % ner_column)

        export_logger.debug('exporting document to %s format' %
                            self._exporter.extension)

        self._exporter.document_to_file(document,
                                        couples,
                                        outfile,
                                        encoding=output_encoding)

        laps = time.time() - start
        export_logger.info('done in %s' % (timedelta(seconds=laps)))
Example #13
    def __init__(self, exporter, log_level="WARNING", log_file=None,
                 lang="fr", lang_style="default.css", pos_column=None,
                 chunk_column=None, ner_column=None, **kwargs):
        super(SEMModule, self).__init__(log_level=log_level,
                                        log_file=log_file,
                                        **kwargs)

        if log_file is not None:
            export_logger.addHandler(file_handler(log_file))
        export_logger.setLevel(log_level)

        self._lang = lang
        self._lang_style = lang_style
        self._pos_column = pos_column
        self._chunk_column = chunk_column
        self._ner_column = ner_column
        if is_string(exporter):
            export_logger.info(u'getting exporter {0}'.format(exporter))
            Exporter = get_exporter(exporter)
            self._exporter = Exporter(lang=self._lang,
                                      lang_style=self._lang_style)
        else:
            export_logger.info(u'using loaded exporter')
            self._exporter = exporter
Example #14
    def process_document(self, document, **kwargs):
        """
        Cleans the sem.storage.corpus of a document, removing unwanted fields.
        
        Parameters
        ----------
        document : sem.storage.Document
            the document containing the corpus to clean.
        ranges : str or list of int or list of str
            if str: the fields to remove will be inferred
            if list of int: each element is the index in corpus.fields of a
            field to remove
            if list of str: the list of fields to remove
        """

        start = time.time()

        if self._log_file is not None:
            clean_info_logger.addHandler(file_handler(self._log_file))
        clean_info_logger.setLevel(self._log_level)

        clean_info_logger.info(u'cleaning document')

        allowed = set(self._allowed)
        fields = set(field for field in document.corpus.fields)
        to_remove = fields - allowed
        document.corpus.fields = self._allowed[:]

        if len(allowed - fields) > 0:
            clean_info_logger.warn(
                u"the following fields are not present in the document"
                u" and may cause an error later: %s",
                u", ".join(allowed - fields))

        indices = [entry.index for entry in document.corpus.fields]
        for i in range(len(document.corpus.sentences)):
            for j in range(len(document.corpus.sentences[i])):
                document.corpus.sentences[i][j] = dict(
                    (a, document.corpus.sentences[i][j][a]) for a in allowed)

        laps = time.time() - start
        clean_info_logger.info(u'done in {0}'.format(timedelta(seconds=laps)))
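
The cleaning step reduces every token dict to the allowed keys. A self-contained sketch of that filtering on a plain list-of-sentences structure (field names are illustrative):

sentences = [
    [{"word": "Pierre", "pos": "NPP", "chunk": "B-NP"},
     {"word": "dort", "pos": "V", "chunk": "B-VN"}],
]
allowed = {"word", "pos"}

for sentence in sentences:
    for i, token in enumerate(sentence):
        sentence[i] = {key: token[key] for key in allowed if key in token}

print(sentences)  # every token now only carries "word" and "pos"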
Example #15
    def process_document(self, document, **kwargs):
        """
        Updates a document by mapping the labels of one of its annotation
        sets to new labels, following the module's mapping.

        Parameters
        ----------
        document : sem.storage.Document
            the input data: a document that already carries the annotation
            set to map
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            map_annotations_logger.addHandler(file_handler(self._log_file))
        map_annotations_logger.setLevel(self._log_level)

        ref_annotation = document.annotation(self._annotation_name)
        ref_annotations = ref_annotation.annotations
        values = set([a.value for a in ref_annotations])
        new_annotations = [
            Tag(annotation.lb, annotation.ub,
                self._mapping.get(annotation.value, annotation.value))
            for annotation in ref_annotations
            if self._mapping.get(annotation.value, None) != u""
        ]

        document.add_annotation(
            Annotation(self._annotation_name,
                       reference=ref_annotation.reference,
                       annotations=new_annotations))

        laps = time.time() - start
        map_annotations_logger.info('in %s' % (timedelta(seconds=laps)))
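
The mapping step relabels each annotation through self._mapping, keeping the original label when it is not in the mapping and dropping annotations that are explicitly mapped to the empty string. A small illustration with plain tuples and hypothetical labels:

mapping = {"PER": "Person", "MISC": ""}  # "" means: drop the annotation
annotations = [(0, 6, "PER"), (10, 16, "LOC"), (20, 25, "MISC")]

mapped = [(lb, ub, mapping.get(value, value))
          for lb, ub, value in annotations
          if mapping.get(value, None) != ""]
print(mapped)  # [(0, 6, 'Person'), (10, 16, 'LOC')]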
Example #16
def main(args):
    start = time.time()

    infile = args.infile
    outfile = args.outfile
    exporter_name = args.exporter_name
    lang = args.lang
    lang_style = args.lang_style
    import_options = args.import_options or {}
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    log_file = args.log_file
    log_level = args.log_level

    pos_column = args.pos_column
    chunk_column = args.chunk_column
    ner_column = args.ner_column
    couples = {}
    if pos_column:
        couples["pos"] = pos_column
        export_logger.debug('POS column is %s' % pos_column)
    if chunk_column:
        couples["chunking"] = chunk_column
        export_logger.debug('chunking column is %s' % chunk_column)
    if ner_column:
        couples["ner"] = ner_column
        export_logger.debug('NER column is %s' % ner_column)

    if log_file is not None:
        export_logger.addHandler(file_handler(log_file))
    export_logger.setLevel(log_level)

    if type(exporter_name) in (str, unicode):
        export_logger.info('getting exporter %s' % (exporter_name))
        Exporter = get_exporter(exporter_name)
        exporter = Exporter(lang=lang, lang_style=lang_style)
    else:
        export_logger.info('using loaded exporter')
        exporter = exporter_name

    if type(import_options) in (list, ):  # list from argparse
        options = {}
        for option in import_options:
            key, value = option.split(u"=", 1)
            try:
                value = sem.misc.str2bool(value)
            except ValueError:
                pass
            options[key] = value
        options["encoding"] = ienc
    else:
        options = import_options

    infile_is_str = type(infile) in (str, unicode)
    if infile_is_str:
        export_logger.info('loading input file')
        document = sem.importers.load(infile, logger=export_logger, **options)
    else:
        export_logger.info('using input document')
        document = infile

    export_logger.debug('exporting document %s' % document.name)
    exporter.document_to_file(document,
                              couples,
                              outfile,
                              encoding=oenc,
                              logger=export_logger)

    laps = time.time() - start
    export_logger.info('done in %s' % (timedelta(seconds=laps)))
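
When import_options comes from argparse it is a list of "key=value" strings, which the loop above splits and coerces to booleans when possible. The sketch below is self-contained: str2bool here is a stand-in for sem.misc.str2bool, whose exact accepted spellings may differ.

def str2bool(value):
    """Stand-in for sem.misc.str2bool (accepted spellings are an assumption)."""
    lowered = value.lower()
    if lowered in ("yes", "true", "on", "1"):
        return True
    if lowered in ("no", "false", "off", "0"):
        return False
    raise ValueError("not a boolean value: {0}".format(value))

import_options = ["fields=word,pos", "strip_html=yes"]  # illustrative options
options = {}
for option in import_options:
    key, value = option.split("=", 1)
    try:
        value = str2bool(value)
    except ValueError:
        pass
    options[key] = value
options["encoding"] = "utf-8"
print(options)  # {'fields': 'word,pos', 'strip_html': True, 'encoding': 'utf-8'}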