Пример #1
0
    def parse(self, text):
        """Extract named entities from URL-quoted text and return them as JSON.

        The text is percent-decoded with ``urllib.unquote``, converted to
        unicode (UTF-8 first, Latin-1 as a fallback), parsed with
        ``lf.parseText`` and then run through the tagging functions.

        :param text: URL-quoted input text (byte string or unicode)
        :return: JSON string of ``result['named_entities']``, or
            ``json.dumps({})`` when parsing fails, nothing is parsed or no
            named entities are found
        """
        result = {}
        doc = None
        try:
            text = urllib.unquote(text)
            # Only byte strings need decoding. Calling .decode() on text that
            # is already unicode triggers an implicit ASCII encode on
            # Python 2 and fails for any non-ASCII character.
            if isinstance(text, bytes):
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError as ee:
                    logging.info(
                        'error in utf-8 decode - (will attempt latin-1 decode)')
                    logging.info(ee)
                    text = text.decode('latin-1')
            doc = lf.parseText(text, self.english)
        except Exception as e:
            # A parse failure is not a decoding problem, so it is reported
            # here instead of being retried through the latin-1 fallback.
            logging.error("Error in NLP: %s", e)
            return json.dumps({})

        if doc is not None:
            try:
                functions = [
                    lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities,
                    lf.tagNounChunks
                ]
                lf.run_functions(doc, result, *functions)

                if 'named_entities' in result:
                    return json.dumps(result['named_entities'])
                else:
                    # Raised inside the try so that a missing key is logged
                    # and answered the same way as any other NLP failure.
                    raise NLPException('No named entities found.')
            except Exception as e:
                logging.error("Error in NLP: %s", e)
                return json.dumps({})
        else:
            logging.info(
                'nothing returned from linguistic features parseText.')
            return json.dumps({})
Пример #2
0
    def parse(self, text):
        """Extract named entities from URL-quoted text and return them as JSON.

        The text is percent-decoded with ``urllib.unquote``, converted to
        unicode (UTF-8 first, Latin-1 as a fallback), parsed with
        ``lf.parseText`` and then run through the tagging functions.

        :param text: URL-quoted input text (byte string or unicode)
        :return: JSON string of ``result['named_entities']``, or ``json.dumps({})`` when
            parsing fails, nothing is parsed or no named entities are found
        """
        result = {}
        doc = None
        try:
            text = urllib.unquote(text)
            # Only byte strings need decoding. Calling .decode() on text that is already
            # unicode triggers an implicit ASCII encode on Python 2 and fails for any
            # non-ASCII character.
            if isinstance(text, bytes):
                try:
                    text = text.decode('utf-8')
                except UnicodeDecodeError as ee:
                    logging.info('error in utf-8 decode - (will attempt latin-1 decode)')
                    logging.info(ee)
                    text = text.decode('latin-1')
            doc = lf.parseText(text, self.english)
        except Exception as e:
            # A parse failure is not a decoding problem, so it is reported here instead
            # of being retried through the latin-1 fallback.
            logging.error("Error in NLP: %s", e)
            return json.dumps({})

        if doc is not None:
            try:
                functions = [lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities, lf.tagNounChunks]
                lf.run_functions(doc, result, *functions)

                if 'named_entities' in result:
                    return json.dumps(result['named_entities'])
                else:
                    # Raised inside the try so that a missing key is logged and answered
                    # the same way as any other NLP failure.
                    raise NLPException('No named entities found.')
            except Exception as e:
                logging.error("Error in NLP: %s", e)
                return json.dumps({})
        else:
            logging.info('nothing returned from linguistic features parseText.')
            return json.dumps({})
Пример #3
0
def run_nlp(input_text='', input_file='', input_directory='', output_directory='', output_json=True):
    """Extract named entities (place names, etc.) from text using natural language processing (NLP).

    The input is taken from the first of these that yields something:
    ``input_text``, ``input_file``, ``input_directory``, then standard input.

    :param input_text: input text to be processed -- optional
    :param input_file: input file to be processed -- optional
    :param input_directory: a directory of files to be processed
    :param output_directory: an output directory where the output results are written to a JSON file -- optional
    :param output_json: specify to output json result or not
    :return: without ``output_directory``, the named entities of the first source (a JSON string
        when ``output_json`` is True, the raw value otherwise); with ``output_directory``, None
        after every source has been passed to ``handle_result``
    :raises NLPException: when there is no text to process or a source yields no named entities
    """

    logging.info("recieved input text: %s " % input_text)

    # Load the English model.
    nlp = English()

    source = None
    if input_text:
        source = [('stdin', input_text)]

    # Only open a file when one was actually given; opening the default ''
    # raises and would make the directory and stdin fallbacks unreachable.
    if not source and input_file:
        with codecs.open(input_file, 'r', 'utf-8') as f:
            source = [(input_file, f.read())]

    if not source and input_directory:
        source = read_files(input_directory)

    if not source:
        # Must be a list holding one (name, text) pair so the loop below can
        # unpack it; empty stdin leaves source unset so the error is raised.
        stdin_text = sys.stdin.read()
        if stdin_text:
            source = [('stdin', stdin_text)]

    if not source:
        raise NLPException('No text to process.')

    for name, text in source:
        result = {}
        try:
            doc = lf.parseText(text, nlp)
        except (UnicodeError, TypeError):
            # Retry with the text decoded to unicode: UTF-8 first, then Latin-1.
            try:
                doc = lf.parseText(text.decode('utf-8'), nlp)
            except UnicodeDecodeError:
                doc = lf.parseText(text.decode('latin-1'), nlp)

        # The list of functions to run.
        functions = [lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities, lf.tagNounChunks]
        lf.run_functions(doc, result, *functions)

        if 'named_entities' not in result:
            raise NLPException('No named entities found.')

        if output_directory:
            # Write results to a file and carry on with the next source.
            handle_result(result['named_entities'], name, output_directory)
        elif output_json is True:
            return json.dumps(result['named_entities'])
        else:
            return result['named_entities']
Пример #4
0
    def parse(self, text):
        """Return the named entities found in URL-quoted ``text`` as a JSON string.

        The text is percent-decoded with ``urllib.unquote`` and handed to
        ``lf.parseText``. If that raises ``UnicodeError`` or ``TypeError`` the
        parse is retried with the text decoded as UTF-8, then as Latin-1.

        :param text: URL-quoted input text
        :return: JSON string of the ``named_entities`` result; None when tagging
            fails or no named entities are found (the failure is logged)
        """
        unquoted = urllib.unquote(text)
        try:
            parsed = lf.parseText(unquoted, self.english)
        except (UnicodeError, TypeError):
            try:
                parsed = lf.parseText(unquoted.decode('utf-8'), self.english)
            except UnicodeDecodeError:
                parsed = lf.parseText(unquoted.decode('latin-1'), self.english)

        tagged = {}
        try:
            taggers = (lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities, lf.tagNounChunks)
            lf.run_functions(parsed, tagged, *taggers)

            # Raised inside the try so a missing key is logged like any other failure.
            if 'named_entities' not in tagged:
                raise NLPException('No named entities found.')
            return json.dumps(tagged['named_entities'])
        except Exception as err:
            logging.error("Error in NLP: %s" % err)
        return None
Пример #5
0
    def parse(self, text):
        """Return the named entities found in URL-quoted ``text`` as JSON.

        The text is percent-decoded with ``urllib.unquote`` and handed to
        ``lf.parseText``. If that raises ``UnicodeError`` or ``TypeError``
        the parse is retried with the text decoded as UTF-8, then Latin-1.

        :param text: URL-quoted input text
        :return: JSON string of the ``named_entities`` result; None when
            tagging fails or no named entities are found (failure is logged)
        """
        unquoted = urllib.unquote(text)
        try:
            parsed = lf.parseText(unquoted, self.english)
        except (UnicodeError, TypeError):
            try:
                parsed = lf.parseText(unquoted.decode('utf-8'), self.english)
            except UnicodeDecodeError:
                parsed = lf.parseText(
                    unquoted.decode('latin-1'), self.english)

        tagged = {}
        try:
            taggers = (
                lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities,
                lf.tagNounChunks
            )
            lf.run_functions(parsed, tagged, *taggers)

            # Raised inside the try so a missing key is logged like any
            # other failure.
            if 'named_entities' not in tagged:
                raise NLPException('No named entities found.')
            return json.dumps(tagged['named_entities'])
        except Exception as err:
            logging.error("Error in NLP: %s" % err)
        return None
Пример #6
0
def run_nlp(input_text='',
            input_file='',
            input_directory='',
            output_directory='',
            output_json=True):
    """Extract named entities (place names, etc.) from text using natural language processing (NLP).

    The input is taken from the first of these that yields something:
    ``input_text``, ``input_file``, ``input_directory``, then standard input.

    :param input_text: input text to be processed -- optional
    :param input_file: input file to be processed -- optional
    :param input_directory: a directory of files to be processed
    :param output_directory: an output directory where the output results are written to a JSON file -- optional
    :param output_json: specify to output json result or not
    :return: without ``output_directory``, the named entities of the first
        source (a JSON string when ``output_json`` is True, the raw value
        otherwise); with ``output_directory``, None after every source has
        been passed to ``handle_result``
    :raises NLPException: when there is no text to process or a source
        yields no named entities
    """

    logging.info("recieved input text: %s " % input_text)

    # Load the English model.
    nlp = English()

    source = None
    if input_text:
        source = [('stdin', input_text)]

    # Only open a file when one was actually given; opening the default ''
    # raises and would make the directory and stdin fallbacks unreachable.
    if not source and input_file:
        with codecs.open(input_file, 'r', 'utf-8') as f:
            source = [(input_file, f.read())]

    if not source and input_directory:
        source = read_files(input_directory)

    if not source:
        # Must be a list holding one (name, text) pair so the loop below can
        # unpack it; empty stdin leaves source unset so the error is raised.
        stdin_text = sys.stdin.read()
        if stdin_text:
            source = [('stdin', stdin_text)]

    if not source:
        raise NLPException('No text to process.')

    for name, text in source:
        result = {}
        try:
            doc = lf.parseText(text, nlp)
        except (UnicodeError, TypeError):
            # Retry with the text decoded to unicode: UTF-8 first, then
            # Latin-1.
            try:
                doc = lf.parseText(text.decode('utf-8'), nlp)
            except UnicodeDecodeError:
                doc = lf.parseText(text.decode('latin-1'), nlp)

        # The list of functions to run.
        functions = [
            lf.tagSentences, lf.tagPOSTags, lf.tagNamedEntities,
            lf.tagNounChunks
        ]
        lf.run_functions(doc, result, *functions)

        if 'named_entities' not in result:
            raise NLPException('No named entities found.')

        if output_directory:
            # Write results to a file and carry on with the next source.
            handle_result(result['named_entities'], name,
                          output_directory)
        elif output_json is True:
            return json.dumps(result['named_entities'])
        else:
            return result['named_entities']