Пример #1
0
 def testExtractorFromUrl(self):
     '''
     Run the extractor on the Wikipedia article about the Louvre and hand
     the detected places to ``self.check`` together with the expected names.
     '''
     extractor = Extractor(url='https://en.wikipedia.org/wiki/Louvre')
     extractor.find_geoEntities()
     expectedPlaces = ['Paris', 'France']
     self.check(extractor.places, expectedPlaces)
Пример #2
0
 def testStackOverflow54721435(self):
     '''
     Scenario from
     https://stackoverflow.com/questions/54721435/unable-to-extract-city-names-from-a-text-using-geograpypython

     The places found in the sample sentence are only printed; nothing is
     asserted.
     '''
     sentence = 'I live in Kadawatha a suburb of Colombo  Sri Lanka'
     extractor = Extractor(text=sentence)
     extractor.find_entities()
     print(extractor.places)
Пример #3
0
 def testStackoverflow54077973(self):
     '''
     Scenario from
     https://stackoverflow.com/questions/54077973/geograpy3-library-for-extracting-the-locations-in-the-text-gives-unicodedecodee

     Extracts entities from a US postal address and passes the places to
     ``self.check``.
     '''
     extractor = Extractor(text='Jersey City New Jersey 07306')
     extractor.find_entities()
     expectedPlaces = ['Jersey', 'City']
     self.check(extractor.places, expectedPlaces)
Пример #4
0
 def testExtractorFromUrl(self):
     '''
     Run the extractor on a BBC news article and hand the detected places
     to ``self.check`` together with the expected names.
     '''
     extractor = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
     extractor.find_geoEntities()
     expectedPlaces = ['Russia', 'Kiev', 'Ukraine']
     self.check(extractor.places, expectedPlaces)
Пример #5
0
 def testStackoverflow43322567(self):
     '''
     Scenario from https://stackoverflow.com/questions/43322567

     Uses the return value of ``find_geoEntities`` for the Wikipedia
     "U.S. state" article, checks it and prints it.
     '''
     extractor = Extractor(url='https://en.wikipedia.org/wiki/U.S._state')
     foundPlaces = extractor.find_geoEntities()
     self.check(foundPlaces, ['Alabama', 'Virginia', 'New York'])
     print(foundPlaces)
Пример #6
0
 def testStackoverflow54712198(self):
     '''
     Scenario from
     https://stackoverflow.com/questions/54712198/not-only-extracting-places-from-a-text-but-also-other-names-in-geograpypython

     The sample text mentions people and institutions only, so the list
     returned by ``find_geoEntities`` is expected to be empty.
     '''
     article = ('Opposition Leader Mahinda Rajapaksa says that the whole public '
                'administration has collapsed due to the constitution council’s '
                'arbitrary actions. The Opposition Leader said so in response to '
                'a query a journalised raised after a meeting held...')
     # the text is passed positionally, exactly as in the reported scenario
     extractor = Extractor(article)
     foundPlaces = extractor.find_geoEntities()
     if self.debug:
         print(foundPlaces)
     self.assertEqual([], foundPlaces)
Пример #7
0
def get_place_context(url=None, text=None):
    """
    Build a PlaceContext from the places extracted out of a url or a text.

    Args:
        url: forwarded to ``Extractor`` as ``url``
        text: forwarded to ``Extractor`` as ``text``

    Returns:
        the PlaceContext after its countries, regions, cities and "other"
        setters have been called, in that order
    """
    extractor = Extractor(url=url, text=text)
    extractor.find_entities()

    context = PlaceContext(extractor.places)
    # the setters are invoked in the same fixed order as before
    context.set_countries()
    context.set_regions()
    context.set_cities()
    context.set_other()
    return context
Пример #8
0
def locateCity(location, correctMisspelling=False, debug=False):
    '''
    Locate the city described by the given location string.

    Args:
        location(string): the description of the location
        correctMisspelling: forwarded to ``Locator.getInstance``
        debug: forwarded to both ``Extractor`` and ``Locator.getInstance``

    Returns:
        whatever ``Locator.locateCity`` yields for the places obtained by
        splitting the location string
    '''
    extractor = Extractor(text=location, debug=debug)
    extractor.split()
    locator = Locator.getInstance(
        correctMisspelling=correctMisspelling, debug=debug)
    return locator.locateCity(extractor.places)
Пример #9
0
    def testStackoverflow55548116(self):
        '''
        Scenario from
        https://stackoverflow.com/questions/55548116/geograpy3-library-is-not-working-properly-and-give-traceback-error

        Extracts the places of every non-empty feed entry and collects them;
        the collected lists are not asserted.
        '''
        placesInFeed = []

        for content in ['Las Vegas is a city in Nevada']:
            if content == "":
                # empty feed entries are skipped
                continue
            extractor = Extractor(text=content)
            extractor.find_entities()
            found = extractor.places
            if self.debug:
                print(found)
            placesInFeed.append(found)
Пример #10
0
def get_place_context(url=None, text=None):
    """
    Build a PlaceContext from the entities extracted out of a url or a text.

    Args:
        url: forwarded to ``Extractor`` as ``url``
        text: forwarded to ``Extractor`` as ``text``

    Returns:
        the PlaceContext created from the extractor's ``places``, ``people``
        and ``organs`` after its countries, regions, cities and "other"
        setters have been called, in that order
    """
    extractor = Extractor(url=url, text=text)
    extractor.find_entities()

    context = PlaceContext(extractor.places, extractor.people,
                           extractor.organs)
    context.set_countries()
    context.set_regions()
    context.set_cities()
    context.set_other()
    return context


# url = 'http://www.bbc.com/news/world-us-canada-39821789'
# places = get_place_context(url=url)
# len(places)
Пример #11
0
def get_place_context(url=None, text=None, labels=Labels.default, debug=False):
    '''
    Create a place context (country, region, city and other information)
    for the entities found in a url or a text.

    Args:
        url(String): the url to read text from (if any)
        text(String): the text to analyze
        labels: the label set handed to ``Extractor.find_entities``
        debug(boolean): if True show debug information

    Returns:
        PlaceContext: the place context built from the extracted places,
            after ``setAll`` has been called on it
    '''
    extractor = Extractor(url=url, text=text, debug=debug)
    extractor.find_entities(labels=labels)

    context = PlaceContext(extractor.places)
    context.setAll()
    return context
Пример #12
0
def test():
    """
    Smoke-test ``Extractor`` with one news url and three short texts.

    Each case asserts that at least one place was found and that the
    expected place names are contained in ``places``.
    """
    article = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
    article.find_entities()

    assert len(article.places) > 0
    for expected in ('Russia', 'Kiev'):
        assert expected in article.places

    # (input text, place names that must be detected)
    samples = (
        (" Perfect just Perfect! It's a perfect storm for Nairobi on a \n"
         "    Friday evening! horrible traffic here is your cue to become worse @Ma3Route ",
         ('Nairobi',)),
        (""" Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """,
         ('Nairobi',)),
        (""" @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """,
         ('Nairobi', 'Ngong')),
    )
    for sample, expected_places in samples:
        extractor = Extractor(text=sample)
        extractor.find_entities()

        assert len(extractor.places) > 0
        for expected in expected_places:
            assert expected in extractor.places
Пример #13
0
def test():
    """
    Smoke-test ``Extractor`` with one news url and several short texts,
    including unicode input and a two-word, non-ASCII city name.

    The debug output uses the single-argument ``print(...)`` form, which is
    valid on Python 3 and prints the same value on Python 2.
    """
    e = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
    e.find_entities()

    assert len(e.places) > 0
    assert 'Russia' in e.places
    assert 'Kiev' in e.places

    # the literal keeps the line break and the indentation of the sample text
    text = (" Perfect just Perfect! It's a perfect storm for Nairobi on a \n"
            "    Friday evening! horrible traffic here is your cue to become worse @Ma3Route ")

    e2 = Extractor(text=text)
    e2.find_entities()

    assert len(e2.places) > 0
    assert 'Nairobi' in e2.places

    text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
    e3 = Extractor(text=text3)
    e3.find_entities()

    assert len(e3.places) > 0
    assert 'Nairobi' in e3.places

    text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
    e4 = Extractor(text=text4)
    e4.find_entities()

    assert len(e4.places) > 0
    assert 'Nairobi' in e4.places
    assert 'Ngong' in e4.places

    # unicode
    text5 = u""" There is a city called New York in the United States."""
    e5 = Extractor(text=text5)
    e5.find_entities()

    print(e5.places)
    assert len(e5.places) == 2
    assert u'New York' in e5.places
    assert u'United States' in e5.places

    # unicode and two words
    text6 = u""" There is a city called São Paulo in Brazil."""
    e6 = Extractor(text=text6)
    e6.find_entities()

    print(e6.places)
    assert len(e6.places) > 1
    assert u'São Paulo' in e6.places
Пример #14
0
def collect_news():
    """
    Poll the configured RSS feeds once, store unseen posts and re-arm the timer.

    Steps, in order:
      1. call ``retrive_news_from_firebase`` and ``set_entityCount``;
      2. parse every feed url with ``feedparser.parse``; a feed whose
         ``feed`` attribute equals ``{}`` is reported as a failed connection;
      3. for every entry whose title is not reported as known by
         ``is_news_already_exist_in_db``: classify the title, build a
         ``News`` object, extract places from the description with
         ``Extractor``, write one document to the ``news`` collection of
         the module-level ``db`` and call ``update_news_dataset``;
      4. print the current time, clear ``news_in_db`` and schedule the
         next run via ``threading.Timer``.

    Reads and rebinds the module-level ``entityCount``; reads ``db`` and
    ``news_in_db``; appends to ``newsObjects``.
    """
    # declared once up front instead of inside the nested loops
    global news_in_db
    global entityCount
    global db

    WAIT_SECONDS = 100  # delay before the next run is started

    print('Running collecting news')
    retrive_news_from_firebase()
    set_entityCount()

    feed_urls = [
        "http://www.adaderana.lk/rss.php",
        "http://www.hirunews.lk/rss/english.xml",
        "https://www.news.lk/news?format=feed",
        "https://srilankamirror.com/news?format=feed&type=rss",
        "http://www.thesundayleader.lk/feed/",
        "https://www.newsfirst.lk/feed/",
    ]
    # the loop variable no longer shadows the list it iterates over
    for feed_url in feed_urls:
        feedParsed = feedparser.parse(feed_url)

        # an empty feed dict is treated as "the feed could not be read"
        if feedParsed.feed != {}:
            for post in feedParsed.entries:
                # explicit comparison kept: the helper's return type is not
                # visible here, so truthiness might not be equivalent
                if is_news_already_exist_in_db(post.title) != True:
                    category = classify_news(post.title)

                    newsObj = News(post.title, post.description, post.summary,
                                   post.link, category, post.published,
                                   entityCount)
                    newsObjects.append(newsObj)

                    # extract the place names mentioned in the description
                    locations = Extractor(text=post.description)
                    locations.find_entities()
                    newsObj.add_locations(locations.places)

                    doc_ref = db.collection(u'news').document()
                    doc_ref.set({
                        u'title': newsObj.title,
                        u'news_id': newsObj.news_id,
                        u'description': newsObj.description,
                        u'summary': newsObj.summary,
                        u'link': newsObj.link,
                        u'category': newsObj.category,
                        u'locations': newsObj.locations,
                        u'date_time': newsObj.date_time
                    })

                    update_news_dataset(newsObj)
                    print("feed " + str(newsObj.news_id) + " : " +
                          str(newsObj.title))
                    print('category: ', category, '. time ', newsObj.date_time,
                          ' . locations:', newsObj.locations)
                    entityCount = entityCount + 1
        else:
            print('Connection failed with url :', feed_url)

    print(time.ctime())
    news_in_db.clear()
    # re-arm: the function schedules itself again on a timer thread
    threading.Timer(WAIT_SECONDS, collect_news).start()
Пример #15
0
    def testExtractorFromText(self):
        '''
        Extract places from several sample texts (tweets, plain sentences,
        unicode input) and pass each result to ``self.check`` together with
        the place names expected for that text.
        '''
        # (input text, expected place names), processed in this order
        samples = [
            (" Perfect just Perfect! It's a perfect storm for Nairobi on a \n"
             "        Friday evening! horrible traffic here is your cue to become worse @Ma3Route ",
             ['Nairobi']),
            (""" Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """,
             ['Nairobi']),
            (""" @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """,
             ['Nairobi', 'Ngong']),
            # unicode
            (u""" There is a city called New York in the United States.""",
             ['New York', 'United States']),
            # unicode and two words
            (u""" There is a city called São Paulo in Brazil.""",
             ['São Paulo']),
        ]
        for sample, expectedPlaces in samples:
            extractor = Extractor(text=sample)
            extractor.find_entities()
            self.check(extractor.places, expectedPlaces)
Пример #16
0
def test():
    """
    Smoke-test ``Extractor`` with one news url and three short texts.

    Each case asserts that at least one place was found and that the
    expected place names are contained in ``places``.
    """
    news = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
    news.find_entities()

    assert len(news.places) > 0
    for country_or_city in ('Russia', 'Kiev'):
        assert country_or_city in news.places

    # each case pairs an input text with the names that must be detected
    cases = (
        (" Perfect just Perfect! It's a perfect storm for Nairobi on a \n"
         "    Friday evening! horrible traffic here is your cue to become worse @Ma3Route ",
         ('Nairobi',)),
        (""" Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """,
         ('Nairobi',)),
        (""" @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """,
         ('Nairobi', 'Ngong')),
    )
    for message, must_contain in cases:
        extractor = Extractor(text=message)
        extractor.find_entities()

        assert len(extractor.places) > 0
        for name in must_contain:
            assert name in extractor.places