Example #1
def entities_text(text):
    """Detects entities in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT',
                   'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
                                   entity.metadata.get('wikipedia_url', '-')))
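
Most of the snippets below reference six, sys, types and enums without showing their imports. A minimal preamble they appear to assume, mirroring the explicit imports in Examples #18 and #25, would be:

import sys

import six
from google.cloud import language_v1beta2
from google.cloud.language_v1beta2 import enums
from google.cloud.language_v1beta2 import types

# Hypothetical call; assumes credentials are configured, e.g. via
# GOOGLE_APPLICATION_CREDENTIALS.
# entities_text(u'President Obama is speaking at the White House.')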
Example #2
def entity_sentiment_text(text):
    """Detects entity sentiment in the provided text."""
    # [START beta_client]
    client = language_v1beta2.LanguageServiceClient()
    # [END beta_client]

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
Example #3
def getTextTopic(searchString):
    try:
        """Classifies content categories of the provided text."""
        client = language_v1beta2.LanguageServiceClient()

        document = types.Document(content=searchString,
                                  type=enums.Document.Type.PLAIN_TEXT)

        categories = client.classify_text(document).categories

        print(categories)

        # for category in categories:
        if not categories:
            return []
        else:
            category = {
                'name': categories[0].name,
                'confidence': categories[0].confidence
            }
            return category

        # return category
        #print(u'=' * 20)
        #print(u'{:<16}: {}'.format('name', category.name))
        #print(u'{:<16}: {}'.format('confidence', category.confidence))

    except ValueError:
        return ''
Example #4
def entity_sentiment_file(gcs_uri):
    """Detects entity sentiment in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
Example #5
def doEntitiyAnalysis(searchString):
    try:
        """Detects entities in the text."""
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT',
                       'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        for entity in entities:
            print('=' * 20)
            print(u'{:<16}: {}'.format('name', entity.name))
            print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
            print(u'{:<16}: {}'.format('metadata', entity.metadata))
            print(u'{:<16}: {}'.format('salience', entity.salience))
            print(u'{:<16}: {}'.format(
                'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))

    except ValueError:
        return ''
Example #6
def classify(text, verbose=True):
    """Classify the input text into categories. """

    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}

    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result
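
The comment above builds a {category.name: category.confidence} dict "so that they can be treated as a sparse vector". A possible sketch of comparing two such dicts (a hypothetical helper, not part of the snippet) is a cosine-style similarity over shared category names:

import math


def sparse_similarity(vec_a, vec_b):
    """Cosine-style similarity between two {category: confidence} dicts."""
    dot = sum(vec_a[name] * vec_b[name] for name in vec_a if name in vec_b)
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)

# e.g. sparse_similarity(classify(text1, verbose=False),
#                        classify(text2, verbose=False))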
Example #7
def getMostRelevantEntity(searchString):
    try:
        """Detects entities in the text."""
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT',
                       'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        return_entity = None
        for entity in entities:
            if entity_type[entity.type] == 'PERSON':
                return_entity = entity
                break

        if return_entity is None:
            # No PERSON entity was found.
            return ''

        result = {
            'name': return_entity.name,
            'salience': return_entity.salience,
            'wikipedia_url': return_entity.metadata.get('wikipedia_url', '-')
        }
        return result

    except ValueError:
        return ''
Example #8
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)
    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
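
syntax_file returns the raw token list instead of printing it. A sketch of how a caller might consume the result (the pos_tag tuple follows enums.PartOfSpeech.Tag, as in the later syntax examples; the bucket URI is hypothetical):

pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
           'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

tokens = syntax_file('gs://your-bucket/your-file.txt')  # hypothetical URI
for token in tokens:
    print(u'{} ({}): {}'.format(token.text.content,
                                pos_tag[token.part_of_speech.tag],
                                token.lemma))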
Example #9
def classify_file(gcs_uri):
    """Classifies the text in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
Example #10
def parse_text(text):
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16
    result = client.analyze_entity_sentiment(document, encoding)

    keywords = []
    categories = []

    for entity in result.entities:
        """print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))"""
        for mention in entity.mentions:
            if mention.sentiment.score > 0 and entity.name not in keywords:
                keywords.append(entity.name.lower())

    sections = text.strip().split("SEC.")
    language_client = language_v1beta2.LanguageServiceClient()
    for section in sections:
        subsections = section.strip().split("    (")
        for i in range(0, len(subsections)):
            subsection = subsections[i]
            if len(subsection) > 750:
                document = types2.Document(
                    content=subsection.encode('utf-8'),
                    type=enums2.Document.Type.PLAIN_TEXT)
                result = language_client.classify_text(document)
                for category in result.categories:
                    categories.append(category.name)
            else:
                if i < len(subsections) - 1:
                    subsections[i +
                                1] = subsections[i] + " " + subsections[i + 1]
    return keywords, categories
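
parse_text mixes the GA client (language, types, enums) for entity sentiment with the v1beta2 client (types2, enums2) for classification. The import aliases are not shown; they presumably correspond to something like:

import sys

import six
from google.cloud import language
from google.cloud import language_v1beta2
from google.cloud.language import enums
from google.cloud.language import types
from google.cloud.language_v1beta2 import enums as enums2
from google.cloud.language_v1beta2 import types as types2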
Example #11
def get_topic(article):
    language_client = language_v1beta2.LanguageServiceClient()
    document = types_topic.Document(content=f"{article['cleaned_text']}",
                                    type=enums_topic.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    highest_confidence = []
    for category in result.categories:
        highest_confidence.append({
            'category': category.name,
            'confidence': category.confidence
        })

    highest = max(highest_confidence, key=lambda x: x['confidence'])
    return filter_topic(highest['category'])
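
filter_topic (and the types_topic/enums_topic aliases, which presumably point at language_v1beta2.types and language_v1beta2.enums) are not defined here. Category names returned by classify_text are slash-delimited paths such as /News/Politics, so a hypothetical filter_topic might simply keep the top-level segment:

def filter_topic(category_name):
    """Hypothetical helper: reduce '/News/Politics' to 'News'."""
    parts = [part for part in category_name.split('/') if part]
    return parts[0] if parts else category_name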
Example #12
def classify(text):
    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    newsConfidence = None
    for category in result.categories:
        #print("Hi")
        #print(category.name)
        if "/News" in category.name:
            newsConfidence = category.confidence
            break
    return newsConfidence
Example #13
def sentiment_file(gcs_uri):
    """Detects sentiment in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
Example #14
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
Example #15
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
Example #16
def sentiment_text(text):
    """Detects sentiment in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
Example #17
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
Example #18
def run_quickstart():
    # [START language_quickstart]
    # Imports the Google Cloud client library
    from google.cloud import language_v1beta2
    from google.cloud.language_v1beta2 import enums
    from google.cloud.language_v1beta2 import types

    # Instantiates a client with the v1beta2 version
    client = language_v1beta2.LanguageServiceClient()

    # The text to analyze
    text = u'Hallo Welt!'
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT,
                              language='de')
    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Text: {}'.format(text))
    print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))
Example #19
def doSentimentAnalysis(searchString):
    try:
        askstr = searchString.encode('utf-8')
        print(askstr)

        document = types.Document(content=askstr,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Instantiates a client
        client = language_v1beta2.LanguageServiceClient()

        sentiment = client.analyze_sentiment(
            document=document).document_sentiment

        #print('Text: {}'.format(text))
        #print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))
        print(str(sentiment.score))
        return (str(sentiment.score), str(sentiment.magnitude))
    except ValueError:
        return ''
Example #20
def entities_file(gcs_uri):
    """Detects entities in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity.type))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
                                   entity.metadata.get('wikipedia_url', '-')))
Example #21
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
Example #22
def entities_file(gcs_uri):
    """Detects entities in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION', 'EVENT',
                   'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
                                   entity.metadata.get('wikipedia_url', '-')))
Example #23
def get_bill(id, session):
    url = 'https://api.propublica.org/congress/v1/%s/bills/%s.json' % (session,
                                                                       id)
    headers = {'X-API-Key': 'gt6jsrJY8cXmh6WmRYwK0820BFfrtZlf25fJSKlo'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req).read()
    billinfo = json.loads(response)['results'][0]
    chamber = ""
    if billinfo['bill_type'][0] == 'h':
        chamber = 'house'
    elif billinfo['bill_type'][0] == 's':
        chamber = 'senate'
    sponsor_funding_list = get_congressman(billinfo['sponsor'], chamber)

    cosponsors_funding_lists = {}
    cosponsor_url = 'https://api.propublica.org/congress/v1/%s/bills/%s/cosponsors.json' % (
        session, id)
    cosponsor_headers = {
        'X-API-Key': 'gt6jsrJY8cXmh6WmRYwK0820BFfrtZlf25fJSKlo'
    }
    cosponsor_req = urllib.request.Request(cosponsor_url, None,
                                           cosponsor_headers)
    cosponsor_response = urllib.request.urlopen(cosponsor_req).read()
    cosponsor_list = json.loads(cosponsor_response)['results'][0]['cosponsors']
    for cosponsor in cosponsor_list:
        cosponsors_funding_lists[cosponsor['name']] = get_congressman(
            cosponsor["name"], chamber)
    funding_list = dict(cosponsors_funding_lists)
    funding_list[billinfo['sponsor']] = sponsor_funding_list

    if chamber == 'house':
        bill_url = 'https://www.gpo.gov/fdsys/pkg/BILLS-' + str(session)\
               + str(id) + 'ih/html/BILLS-' + str(session) + str(id) + 'ih.htm'
    elif chamber == 'senate':
        bill_url = 'https://www.gpo.gov/fdsys/pkg/BILLS-' + str(session) \
                   + str(id) + 'is/html/BILLS-' + str(session) + str(id) + 'is.htm'
    bill_headers = {'User-Agent': 'Mozilla/5.0'}
    bill_req = urllib.request.Request(bill_url, None, bill_headers)
    bill_response = urllib.request.urlopen(bill_req).read().decode("utf-8")
    keywords, categories = parse_text(html2text.html2text(bill_response))

    words_to_check = []
    #hypernyms = []
    for word in keywords:
        try:
            if include(word) and ' ' not in word:
                syn = nltk.corpus.wordnet.synsets(word)
                words_to_check = words_to_check + syn
                """paths = syn.hypernym_paths()
                for path in paths:
                    hypernyms = hypernyms + path"""
            else:
                lst = word.strip().split(' ')
                for w in lst:
                    if include(word):
                        syn = nltk.corpus.wordnet.synsets(w)
                        words_to_check = words_to_check + syn
                        """paths = syn.hypernym_paths()
                        for path in paths:
                            hypernyms = hypernyms + path"""
        except nltk.corpus.reader.wordnet.WordNetError:
            pass
    syn_words = []
    for synword in words_to_check:
        word = synword.name().strip().split(".")[0].replace("_", " ")
        if include(word):
            syn_words.append(word)
    words_to_check = set([word
                          for word in keywords if include(word)] + syn_words)

    relevant_list = {}
    for sponsor in funding_list:
        sponsor_relevant_list = {}
        if funding_list[sponsor] is not None and funding_list[sponsor][0] is not None:
            for company in funding_list[sponsor][0]:
                try:
                    wikipage = wptools.page(company.replace(" ", "_"))
                    pagedata = wikipage.get_query().data['extext']
                    for word in words_to_check:
                        if word in pagedata:
                            sponsor_relevant_list[company] = funding_list[
                                sponsor][0][company]

                    language_client = language_v1beta2.LanguageServiceClient()
                    document = types2.Document(
                        content=pagedata, type=enums2.Document.Type.PLAIN_TEXT)
                    result = language_client.classify_text(document)
                    # flag is True if any returned category overlaps one of
                    # the bill's categories.
                    flag = False
                    for category in result.categories:
                        for bill_category in categories:
                            if category.name in bill_category or bill_category in category.name:
                                flag = True
                    if flag:
                        sponsor_relevant_list[company] = funding_list[sponsor][
                            0][company]
                except LookupError:
                    pass
            relevant_list[sponsor] = sponsor_relevant_list
    print(json.dumps(relevant_list, indent=4, separators=(',', ': ')))
    return relevant_list
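
get_congressman and include are external helpers that are not shown here. get_congressman apparently returns campaign-funding data keyed by company and is too project-specific to reconstruct; include behaves like a keyword filter, for which a minimal hypothetical stand-in might be:

def include(word):
    """Hypothetical keyword filter: keep longer alphabetic words and
    skip a few obvious stop words."""
    stop_words = {'the', 'and', 'for', 'that', 'with', 'this', 'shall'}
    word = word.strip().lower()
    return word.isalpha() and len(word) > 3 and word not in stop_words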
Example #24
def getTag(content_input):
    # Assumes a module-level client created elsewhere, e.g.
    # language_client = language_v1beta2.LanguageServiceClient()
    document = types.Document(content=content_input,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    return result
Example #25
from google.cloud import language_v1beta2
from google.cloud.language_v1beta2 import enums
from google.cloud.language_v1beta2 import types

language_client = language_v1beta2.LanguageServiceClient()

document = types.Document(content='''
    Rafael Montero Shines in Mets Victory Over the Reds. Montero, who was demoted at midseason, took
     a one-hitter into the ninth inning as the Mets continued to dominate Cincinnati with a win at 
     Great American Ball Park.''',
                          type=enums.Document.Type.PLAIN_TEXT)

result = language_client.classify_text(document)

for category in result.categories:
    print('category name: ', category.name)
    print('category confidence: ', category.confidence, '\n')