Example #1
def getMostRelevantEntity(searchString):
    """Returns the first PERSON entity detected in the text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # Entity types from enums.Entity.Type.
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        return_entity = None
        for entity in entities:
            if entity_type[entity.type] == 'PERSON':
                return_entity = entity
                break

        if return_entity is None:
            return ''

        result = {
            'name': return_entity.name,
            'salience': return_entity.salience,
            'wikipedia_url': return_entity.metadata.get('wikipedia_url', '-')
        }
        return result

    except ValueError:
        return ''
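The snippets on this page omit their imports, and they target two different generations of the google-cloud-language package: most use the 1.x-style `types`/`enums` modules, while `sample_analyze_syntax` and `sample_classify_text` use the newer request-object API. A minimal import header for the 1.x-style examples might look like the sketch below; the exact aliases are assumptions inferred from how the snippets reference them.

# Assumed imports for the 1.x-style examples (a sketch; aliases inferred from usage).
import os
import sys

import six
from google.cloud import language                       # v1 client (used in parse_text)
from google.cloud import language_v1beta2
from google.cloud.language_v1beta2 import enums, types   # aliases most snippets rely on
from oauth2client.service_account import ServiceAccountCredentials  # used in parse_file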
Example #2
def parse_file(resume):
    # The client library reads the service-account key from this environment variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(
        os.path.dirname(__file__), 'Parsing-385521996355.json')
    print("GOOGLE_APPLICATION_CREDENTIALS:",
          os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
    # Note: these explicit credentials are never passed to the client below.
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        os.path.join(os.path.dirname(__file__), 'Parsing-385521996355.json'),
        scopes='https://www.googleapis.com/auth/cloud-language')
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=resume,
                              type=enums.Document.Type.PLAIN_TEXT)
    ent = client.analyze_entities(document=document).entities

    parsed = {}
    # Extract name, email address, and phone number.
    parsed['person'] = personel_information(resume)
    # Extract undergraduate major/minor; graduate degrees still need work.
    parsed['education'] = extract_School(resume, ent)
    # Extract companies and work experience; still needs a lot of work.
    parsed['work'] = extract_company(resume, ent)
    # Extract skills; works well but is slow.
    parsed['skills'] = extract_all_skills(resume)
    return parsed
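A hypothetical call, assuming the resume has already been read into a string and that the helpers parse_file relies on (personel_information, extract_School, extract_company, extract_all_skills) are defined elsewhere in the project:

# Hypothetical usage of parse_file; 'resume.txt' is a placeholder path.
with open('resume.txt') as f:
    resume_text = f.read()

parsed = parse_file(resume_text)
print(parsed['person'])
print(parsed['skills'])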
def classify(text, verbose=True):
    """Classify the input text into categories. """

    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}

    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result
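The comment in classify suggests treating the returned dictionary as a sparse vector. A small sketch of that idea, assuming you want to compare two texts by the categories they share (not part of the original code):

def category_similarity(result_a, result_b):
    # Dot product over the categories two classify() results have in common;
    # a higher score means the texts were filed under more of the same categories.
    shared = set(result_a) & set(result_b)
    return sum(result_a[name] * result_b[name] for name in shared)

# Hypothetical usage:
# score = category_similarity(classify(text_a, verbose=False),
#                             classify(text_b, verbose=False))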
Example #4
    def test_analyze_sentiment(self):

        client = language_v1beta2.LanguageServiceClient()
        content = "Hello, world!"
        type_ = enums.Document.Type.PLAIN_TEXT
        document = {"content": content, "type": type_}
        response = client.analyze_sentiment(document)
    def test_analyze_entities(self, mock_create_stub):
        # Mock gRPC layer
        grpc_stub = mock.Mock()
        mock_create_stub.return_value = grpc_stub

        client = language_v1beta2.LanguageServiceClient()

        # Mock request
        document = {}

        # Mock response
        language = 'language-1613589672'
        expected_response = {'language': language}
        expected_response = language_service_pb2.AnalyzeEntitiesResponse(
            **expected_response)
        grpc_stub.AnalyzeEntities.return_value = expected_response

        response = client.analyze_entities(document)
        self.assertEqual(expected_response, response)

        grpc_stub.AnalyzeEntities.assert_called_once()
        args, kwargs = grpc_stub.AnalyzeEntities.call_args
        self.assertEqual(len(args), 2)
        self.assertEqual(len(kwargs), 1)
        self.assertIn('metadata', kwargs)
        actual_request = args[0]

        expected_request = language_service_pb2.AnalyzeEntitiesRequest(
            document=document)
        self.assertEqual(expected_request, actual_request)
Example #6
def entity_sentiment_text(text):
    """Detects entity sentiment in the provided text."""
    # [START beta_client]
    client = language_v1beta2.LanguageServiceClient()
    # [END beta_client]

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
    def test_analyze_sentiment(self):
        # Setup Expected Response
        language = "language-1613589672"
        expected_response = {"language": language}
        expected_response = language_service_pb2.AnalyzeSentimentResponse(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = language_v1beta2.LanguageServiceClient()

        # Setup Request
        document = {}

        response = client.analyze_sentiment(document)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = language_service_pb2.AnalyzeSentimentRequest(
            document=document)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #8
    def test_annotate_text(self):
        # Setup Expected Response
        language = 'language-1613589672'
        expected_response = {'language': language}
        expected_response = language_service_pb2.AnnotateTextResponse(
            **expected_response)

        # Mock the API response
        channel = ChannelStub(responses=[expected_response])
        patch = mock.patch('google.api_core.grpc_helpers.create_channel')
        with patch as create_channel:
            create_channel.return_value = channel
            client = language_v1beta2.LanguageServiceClient()

        # Setup Request
        document = {}
        features = {}

        response = client.annotate_text(document, features)
        assert expected_response == response

        assert len(channel.requests) == 1
        expected_request = language_service_pb2.AnnotateTextRequest(
            document=document, features=features)
        actual_request = channel.requests[0][1]
        assert expected_request == actual_request
Example #9
def getTextTopic(searchString):
    """Classifies content categories of the provided text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        document = types.Document(content=searchString,
                                  type=enums.Document.Type.PLAIN_TEXT)

        categories = client.classify_text(document).categories
        print(categories)

        if not categories:
            return []

        # Return only the first category.
        category = {
            'name': categories[0].name,
            'confidence': categories[0].confidence
        }
        return category

    except ValueError:
        return ''
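Note that getTextTopic can return three shapes: a dict for the first category, an empty list when nothing was classified, and an empty string on a ValueError. A hypothetical caller therefore checks the type before using the result:

# Hypothetical usage of getTextTopic; the input string is a placeholder.
topic = getTextTopic('The quarterback threw for 300 yards in the playoff game.')
if isinstance(topic, dict):
    print(topic['name'], topic['confidence'])
else:
    print('No category was returned for this text.')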
Example #10
def entity_sentiment_file(gcs_uri):
    """Detects entity sentiment in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
Example #11
def getCDID(rows):
    s = []
    language_client = language_v1beta2.LanguageServiceClient()

    def getTag(content_input):
        document = types.Document(content=content_input,
                                  type=enums.Document.Type.PLAIN_TEXT)
        result = language_client.classify_text(document)
        return result

    if rows is not None:
        name = "/home/tnguyen/CREU/CREU/HedgeDetection/parse_data_articles/fulltext/" + rows[
            0] + '.txt'
        print(name)
        st = ''
        conf = ''
        with open(name, 'r') as myfile:
            if os.stat(name).st_size != 0:
                article = myfile.read()
                try:
                    results = getTag(article)
                    if results is not None:
                        for result in results.categories:
                            if result is not None:
                                st = result.name
                                print(st)
                                s.append((rows[0], rows[1], rows[2], rows[3],
                                          rows[4], st))
                except Exception:
                    # classify_text needs a minimum amount of text to work with.
                    print("too few words")
            else:
                s.append((rows[0], rows[1], rows[2], rows[3], rows[4], 0))
    return s
Example #12
def entities_text(text):
    """Detects entities in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT',
                   'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
                                   entity.metadata.get('wikipedia_url', '-')))
    def test_classify_text(self, mock_create_stub):
        # Mock gRPC layer
        grpc_stub = mock.Mock()
        mock_create_stub.return_value = grpc_stub

        client = language_v1beta2.LanguageServiceClient()

        # Mock request
        document = {}

        # Mock response
        expected_response = {}
        expected_response = language_service_pb2.ClassifyTextResponse(
            **expected_response)
        grpc_stub.ClassifyText.return_value = expected_response

        response = client.classify_text(document)
        self.assertEqual(expected_response, response)

        grpc_stub.ClassifyText.assert_called_once()
        args, kwargs = grpc_stub.ClassifyText.call_args
        self.assertEqual(len(args), 2)
        self.assertEqual(len(kwargs), 1)
        self.assertIn('metadata', kwargs)
        actual_request = args[0]

        expected_request = language_service_pb2.ClassifyTextRequest(
            document=document)
        self.assertEqual(expected_request, actual_request)
Example #14
def doEntitiyAnalysis(searchString):
    """Detects entities in the text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # Entity types from enums.Entity.Type.
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        for entity in entities:
            print('=' * 20)
            print(u'{:<16}: {}'.format('name', entity.name))
            print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
            print(u'{:<16}: {}'.format('metadata', entity.metadata))
            print(u'{:<16}: {}'.format('salience', entity.salience))
            print(u'{:<16}: {}'.format(
                'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))

    except ValueError:
        return ''
Example #15
    def test_analyze_sentiment_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        client = language_v1beta2.LanguageServiceClient(channel=channel)

        # Setup request
        document = {}

        with pytest.raises(CustomException):
            client.analyze_sentiment(document)
Example #16
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)
    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
Example #17
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = language_v1beta2.types.Document(
        content=text.encode('utf-8'),
        type=language_v1beta2.enums.Document.Type.HTML)

    categories = client.classify_text(document).categories
    return categories
Example #18
def classify_file(gcs_uri):
    """Classifies the text in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
    def test_analyze_sentiment_exception(self):
        # Mock the API response
        channel = ChannelStub(responses=[CustomException()])
        patch = mock.patch("google.api_core.grpc_helpers.create_channel")
        with patch as create_channel:
            create_channel.return_value = channel
            client = language_v1beta2.LanguageServiceClient()

        # Setup request
        document = {}

        with pytest.raises(CustomException):
            client.analyze_sentiment(document)
Example #20
def parse_text(text):
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16
    result = client.analyze_entity_sentiment(document, encoding)

    keywords = []
    categories = []

    for entity in result.entities:
        """print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))"""
        for mention in entity.mentions:
            if mention.sentiment.score > 0 and entity.name not in keywords:
                keywords.append(entity.name.lower())

    sections = text.strip().split("SEC.")
    language_client = language_v1beta2.LanguageServiceClient()
    for section in sections:
        subsections = section.strip().split("    (")
        for i in range(0, len(subsections)):
            subsection = subsections[i]
            if len(subsection) > 750:
                document = types2.Document(
                    content=subsection.encode('utf-8'),
                    type=enums2.Document.Type.PLAIN_TEXT)
                result = language_client.classify_text(document)
                for category in result.categories:
                    categories.append(category.name)
            else:
                if i < len(subsections) - 1:
                    subsections[i +
                                1] = subsections[i] + " " + subsections[i + 1]
    return keywords, categories
Example #21
def get_topic(article):
    language_client = language_v1beta2.LanguageServiceClient()
    document = types_topic.Document(content=f"{article['cleaned_text']}",
                                    type=enums_topic.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    highest_confidence = []
    for category in result.categories:
        highest_confidence.append({
            'category': category.name,
            'confidence': category.confidence
        })

    highest = max(highest_confidence, key=lambda x: x['confidence'])
    return filter_topic(highest['category'])
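classify_text returns hierarchical category names as slash-delimited paths such as "/News/Politics", so filter_topic (not shown in this snippet) presumably maps the full path to something coarser. A purely illustrative stand-in that keeps only the top-level segment:

def filter_topic(category_name):
    # Illustrative placeholder only; the real filter_topic is not part of this snippet.
    # '/News/Politics' -> 'News'
    return category_name.strip('/').split('/')[0]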
Example #22
def classify(text):
    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    newsConfidence = None
    for category in result.categories:
        #print("Hi")
        #print(category.name)
        if "/News" in category.name:
            newsConfidence = category.confidence
            break
    return newsConfidence
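Because classify returns None when no /News category is found, a hypothetical caller might combine it with a confidence threshold (the article text and the 0.6 cutoff below are placeholders):

# Hypothetical usage of classify.
confidence = classify(article_text)
if confidence is not None and confidence > 0.6:
    print('Looks like a news article (confidence {:.2f})'.format(confidence))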
    def test_analyze_sentiment_exception(self, mock_create_stub):
        # Mock gRPC layer
        grpc_stub = mock.Mock()
        mock_create_stub.return_value = grpc_stub

        client = language_v1beta2.LanguageServiceClient()

        # Mock request
        document = {}

        # Mock exception response
        grpc_stub.AnalyzeSentiment.side_effect = CustomException()

        self.assertRaises(errors.GaxError, client.analyze_sentiment, document)
Example #24
def sentiment_file(gcs_uri):
    """Detects sentiment in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def sample_analyze_syntax():
    # Create a client
    client = language_v1beta2.LanguageServiceClient()

    # Initialize request argument(s)
    document = language_v1beta2.Document()
    document.content = "content_value"

    request = language_v1beta2.AnalyzeSyntaxRequest(document=document)

    # Make the request
    response = client.analyze_syntax(request=request)

    # Handle the response
    print(response)
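The same request-object style works for text stored in Cloud Storage by setting gcs_content_uri instead of content; the sketch below assumes a placeholder bucket path.

def sample_analyze_syntax_gcs():
    # A variant of the sample above for a file in Cloud Storage (placeholder URI).
    client = language_v1beta2.LanguageServiceClient()

    document = language_v1beta2.Document()
    document.gcs_content_uri = "gs://your-bucket/your-file.txt"

    request = language_v1beta2.AnalyzeSyntaxRequest(document=document)

    # Make the request
    response = client.analyze_syntax(request=request)

    # Handle the response
    print(response)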
Example #26
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
Example #27
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
Example #28
def sample_classify_text():
    # Create a client
    client = language_v1beta2.LanguageServiceClient()

    # Initialize request argument(s)
    document = language_v1beta2.Document()
    document.content = "content_value"

    request = language_v1beta2.ClassifyTextRequest(
        document=document,
    )

    # Make the request
    response = client.classify_text(request=request)

    # Handle the response
    print(response)
Example #29
def sentiment_text(text):
    """Detects sentiment in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))