Пример #1
0
def process_after_parsing(news_item, maps_key):
    """
    Classify a parsed news item and, for accidents, attach location data.

    The item is mutated in place: 'accident' is always set from the title.
    When the item is classified as an accident, 'location' is extracted from
    the description (falling back to the title), geocoded, and on a geocode
    hit 'lat', 'lon', 'resolution' and the matching db location columns are
    filled in.

    Enrichment is best-effort: any failure is logged and the item is returned
    with whatever fields were filled before the failure.

    :param news_item: dict-like item with at least 'title' and 'description'
    :param maps_key: google maps key passed to geocode_extract
    :return: the same news_item object, possibly enriched
    """
    import logging

    title = news_item['title']
    news_item['accident'] = classify_ynet(title)
    if not news_item['accident']:
        return news_item

    try:
        location = ''
        if news_item['description'] != '':
            location = manual_filter_location_of_text(
                news_item['description'])
        if location == '':
            # Nothing usable in the description: fall back to the title.
            location = manual_filter_location_of_text(title)
        news_item['location'] = location

        geo_location = geocode_extract(location, maps_key)
        if geo_location is None:
            return news_item

        news_item['lat'] = geo_location['geom']['lat']
        news_item['lon'] = geo_location['geom']['lng']
        news_item['resolution'] = set_accident_resolution(geo_location)
        db_location = get_db_matching_location(news_item['lat'],
                                               news_item['lon'],
                                               news_item['resolution'],
                                               geo_location['road_no'])
        # Guard in case the lookup yields nothing; subscripting None would
        # otherwise raise a TypeError that ends up in the handler below.
        if db_location is not None:
            for col in (
                    'region_hebrew', 'district_hebrew', 'yishuv_name',
                    'street1_hebrew', 'street2_hebrew',
                    'non_urban_intersection_hebrew', 'road1', 'road2',
                    'road_segment_name',
            ):
                news_item[col] = db_location[col]
    except Exception:
        # Keep the best-effort contract (never raise to the caller), but
        # leave a traceback instead of hiding the failure completely.
        logging.getLogger(__name__).exception(
            'failed to extract location data for news item: %s', title)
    return news_item
Пример #2
0
def ynet_news_flash_crawl(rss_link, maps_key):
    """
    Schedule a YnetFlashScrap crawl for every new entry of a ynet rss feed.

    The feed entries are walked in reverse of the order feedparser returns
    them. An entry is scheduled when no latest date is stored for 'ynet', or
    when its published date is strictly later than that stored date. All
    scheduled crawls run when the crawler process is started at the end.

    :param rss_link: rss link to parse for news flashes
    :param maps_key: google maps key, forwarded to every scheduled crawl
    :return: None
    """
    newest_stored = get_latest_date_from_db('ynet')
    feed = feedparser.parse(rss_link)
    crawler = CrawlerProcess()
    for item in reversed(feed.entries):
        # The last 6 characters are cut off before parsing (presumably the
        # trailing UTC offset such as " +0300" -- confirm against the feed).
        stamp_text = item.published[:-6]
        published_at = datetime.strptime(stamp_text, '%a, %d %b %Y %H:%M:%S')
        published_at = published_at.replace(tzinfo=None)

        # Skip entries that are not newer than what is already stored.
        if newest_stored is not None and published_at <= newest_stored:
            continue

        flash = {
            'date_parsed': published_at,
            'title': item.title,
            'link': item.links[0].href,
            'date': item.published,
            'location': '',
            'lat': 0,
            'lon': 0,
            'accident': classify_ynet(item.title),
            'source': 'ynet'
        }
        crawler.crawl(YnetFlashScrap,
                      item.links[0].href,
                      news_item=flash,
                      maps_key=maps_key)
    crawler.start()
Пример #3
0
def extract_geo_features(parsed_item, google_maps_key):
    """
    Build a news item from a parsed item and add geo features for accidents.

    The result starts from init_news_item_extracted_features() overlaid with
    parsed_item; parsed_item itself is not modified. 'accident' is always set
    from the title. For accidents, 'location' is extracted from the
    description (falling back to the title), geocoded, and on a geocode hit
    'lat', 'lon', 'resolution' and the matching db location columns are set.

    Extraction is best-effort: any failure is logged and the item is returned
    with whatever fields were filled before the failure.

    :param parsed_item: mapping with at least 'title' and 'description'
    :param google_maps_key: google maps key passed to geocode_extract
    :return: new dict holding the defaults, the parsed fields and any
        extracted geo features
    """
    import logging

    news_item = {**init_news_item_extracted_features(), **parsed_item}
    news_item['accident'] = classify_ynet(news_item['title'])
    if not news_item['accident']:
        return news_item

    try:
        location = None
        if news_item['description'] is not None:
            location = manual_filter_location_of_text(
                news_item['description'])
        if location is None:
            # Nothing usable in the description: fall back to the title.
            location = manual_filter_location_of_text(news_item['title'])
        news_item['location'] = location

        geo_location = geocode_extract(location, google_maps_key)
        if geo_location is None:
            return news_item

        news_item['lat'] = geo_location['geom']['lat']
        news_item['lon'] = geo_location['geom']['lng']
        news_item['resolution'] = set_accident_resolution(geo_location)
        db_location = get_db_matching_location(news_item['lat'],
                                               news_item['lon'],
                                               news_item['resolution'],
                                               geo_location['road_no'])
        # Guard in case the lookup yields nothing; subscripting None would
        # otherwise raise a TypeError that ends up in the handler below.
        if db_location is not None:
            for col in (
                    'region_hebrew', 'district_hebrew', 'yishuv_name',
                    'street1_hebrew', 'street2_hebrew',
                    'non_urban_intersection_hebrew', 'road1', 'road2',
                    'road_segment_name',
            ):
                news_item[col] = db_location[col]
    except Exception:
        # Keep the best-effort contract (never raise to the caller), but
        # leave a traceback instead of hiding the failure completely.
        logging.getLogger(__name__).exception(
            'failed to extract geo features for news item: %s',
            news_item['title'])
    return news_item
Пример #4
0
def extract_geo_features(parsed_item, google_maps_key):
    """
    Build a news item from a parsed item and add geo features for accidents.

    The result starts from init_news_item_extracted_features() overlaid with
    parsed_item; parsed_item itself is not modified. "accident" is always set
    from the title. For accidents, "location" is extracted from the
    description (falling back to the title), geocoded, and on a geocode hit
    "lat", "lon", "resolution" and the matching db location columns are set.

    Extraction is best-effort: any failure is logged and the item is returned
    with whatever fields were filled before the failure.

    :param parsed_item: mapping with at least "title" and "description"
    :param google_maps_key: google maps key passed to geocode_extract
    :return: new dict holding the defaults, the parsed fields and any
        extracted geo features
    """
    import logging

    news_item = {**init_news_item_extracted_features(), **parsed_item}
    news_item["accident"] = classify_ynet(news_item["title"])
    if not news_item["accident"]:
        return news_item

    try:
        location = None
        if news_item["description"] is not None:
            location = manual_filter_location_of_text(
                news_item["description"])
        if location is None:
            # Nothing usable in the description: fall back to the title.
            location = manual_filter_location_of_text(news_item["title"])
        news_item["location"] = location

        geo_location = geocode_extract(location, google_maps_key)
        if geo_location is None:
            return news_item

        news_item["lat"] = geo_location["geom"]["lat"]
        news_item["lon"] = geo_location["geom"]["lng"]
        news_item["resolution"] = set_accident_resolution(geo_location)
        db_location = get_db_matching_location(
            news_item["lat"],
            news_item["lon"],
            news_item["resolution"],
            geo_location["road_no"],
        )
        # Guard in case the lookup yields nothing; subscripting None would
        # otherwise raise a TypeError that ends up in the handler below.
        if db_location is not None:
            for col in (
                    "region_hebrew",
                    "district_hebrew",
                    "yishuv_name",
                    "street1_hebrew",
                    "street2_hebrew",
                    "non_urban_intersection_hebrew",
                    "road1",
                    "road2",
                    "road_segment_name",
            ):
                news_item[col] = db_location[col]
    except Exception:
        # Keep the best-effort contract (never raise to the caller), but
        # leave a traceback instead of hiding the failure completely.
        logging.getLogger(__name__).exception(
            "failed to extract geo features for news item: %s",
            news_item["title"])
    return news_item