class BasicCliffTest(unittest.TestCase):
    # A basic set of test cases to make sure the API can pull from the server correctly.
    # Requires a running CLIFF server; its address is read from the CLIFF_URL
    # environment variable in setUp().

    def setUp(self):
        """Build a Cliff client against the server named by CLIFF_URL."""
        # NOTE(review): if CLIFF_URL is unset this passes None to Cliff() —
        # assumes the environment variable is always set; confirm in CI config.
        self._url = os.getenv("CLIFF_URL")
        self._cliff = Cliff(self._url)

    def test_parse_text(self):
        """Parse a sentence and check one org, one place, one person come back."""
        # "Einstien" is spelled this way in the fixture text — presumably to
        # exercise fuzzy person matching; TODO confirm it is intentional.
        results = self._cliff.parse_text(
            "This is about Einstien at the IIT in New Delhi.")
        results = results['results']
        print(results)
        self.assertEqual(len(results['organizations']), 1)
        self.assertEqual(len(results['places']['mentions']), 1)
        # 1261481 is the geonames id the server is expected to resolve for New Delhi.
        self.assertEqual(results['places']['mentions'][0]['id'], 1261481)
        self.assertEqual(len(results['people']), 1)

    def test_extract_content(self):
        """Fetch a live news URL through CLIFF and sanity-check the extraction."""
        test_url = "https://www.foxnews.com/us/temple-university-stands-by-marc-lamont-hill-after-cnn-fires-him-for-anti-israel-remarks"
        results = self._cliff.extract_content(test_url)
        results = results['results']
        self.assertEqual(test_url, results['url'])
        # Loose check: extracted article body should be non-trivial.
        self.assertTrue(len(results['text']) > 100)

    def test_geonames_lookup(self):
        """Look up MIT by geonames id and walk the parent chain up to the country."""
        results = self._cliff.geonames_lookup(4943351)
        self.assertEqual(results['id'], 4943351)
        self.assertEqual(results['lon'], -71.09172)
        self.assertEqual(results['lat'], 42.35954)
        self.assertEqual(results['name'],
                         "Massachusetts Institute of Technology")
        self.assertEqual(results['parent']['name'], "City of Cambridge")
        self.assertEqual(results['parent']['parent']['name'],
                         "Middlesex County")
        self.assertEqual(results['parent']['parent']['parent']['name'],
                         "Massachusetts")
        self.assertEqual(
            results['parent']['parent']['parent']['parent']['name'],
            "United States")

    def test_local_replacements(self):
        """Verify text_replacements changes which geonames place is matched."""
        replacements = {
            'Londonderry': 'London',
        }
        # make sure non-replaced fetches the city in the UK
        results = self._cliff.parse_text("This is about London.")['results']
        mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDON_UK, mention['id'])
        # now see if it gets the city with replacements
        replacing_cliff = Cliff(self._url, text_replacements=replacements)
        results = replacing_cliff.parse_text(
            "This is about London.")['results']
        replaced_mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDERRY_NH, replaced_mention['id'])
def test_local_replacements(self):
    """Check that a client built with text_replacements resolves a different place."""
    substitutions = {
        'Londonderry': 'London',
    }

    # Baseline: the plain client should resolve "London" to the UK city.
    baseline = self._cliff.parse_text("This is about London.")['results']
    baseline_mention = baseline['places']['mentions'][0]
    self.assertEqual(GEONAME_LONDON_UK, baseline_mention['id'])

    # With replacements active, the same sentence should resolve to
    # Londonderry, New Hampshire instead.
    replacing_client = Cliff(self._url, text_replacements=substitutions)
    swapped = replacing_client.parse_text(
        "This is about London.")['results']
    swapped_mention = swapped['places']['mentions'][0]
    self.assertEqual(GEONAME_LONDERRY_NH, swapped_mention['id'])
def clavin(self, url='http://localhost:8080'):
    """Geoparse self.body_page via a local CLAVIN/CLIFF server and cache the result.

    The raw response is written to clavin.json, then re-read into self.d so
    that self.d reflects exactly what was persisted to disk.  If the server
    cannot be reached the error is logged and an empty result is used
    (best-effort behavior preserved from the original).

    Args:
        url: address of the CLAVIN/CLIFF server (new, defaulted parameter —
             generalizes the previously hard-coded localhost address).
    """
    my_cliff = Cliff(url)
    dictionary = {}
    # Fix: the original wrapped this try in a `while True:` loop whose both
    # branches broke immediately — the loop was dead code and is removed.
    try:
        dictionary = my_cliff.parse_text(self.body_page)
    except Exception:  # narrowed from bare `except:` so Ctrl-C/SystemExit propagate
        print("Clavin Docker not running or link not valid", '\n')
        logging.error("Clavin Docker not running or link not valid")
    # Persist the response (pretty-printed) for later inspection.
    json_object = json.dumps(dictionary, indent=4)
    with open("clavin.json", "w") as outfile:
        outfile.write(json_object)
    logging.info("Clavin JSON file written")
    # Deliberately re-read from disk rather than reusing `dictionary`,
    # matching the original round-trip behavior.
    with open('clavin.json') as fi:
        # with open('sample.json') as fi:
        self.d = json.load(fi)
    if not self.d:
        logging.error("Clavin JSON File Empty")
# NOTE(review): this fragment begins inside a `try:` block that opens before
# the visible source — the leading statements keep one level of indent and the
# orphan `except` below pairs with that unseen `try`.
    handler = SentryHandler(config.get('SENTRY_DSN'))
    handler.setLevel(logging.ERROR)
    setup_logging(handler)
except ConfigException as e:
    # No SENTRY_DSN configured — run without Sentry error reporting.
    logger.info("no sentry logging")

# Connect to MediaCloud
TOOL_API_KEY = config.get('MEDIA_CLOUD_API_KEY')
mc = mediacloud.api.AdminMediaCloud(TOOL_API_KEY)
logger.info("Connected to mediacloud")

# Connect to CLIFF if the settings are there
cliff = None
try:
    cliff = Cliff(config.get('CLIFF_URL'))
except KeyError as e:
    # CLIFF_URL missing from config — the app continues with cliff = None.
    logger.warning("no CLIFF connection")

NYT_THEME_LABELLER_URL = config.get('NYT_THEME_LABELLER_URL')

# Connect to the app's mongo DB
try:
    user_db = UserDatabase(config.get('MONGO_URL'))
    analytics_db = AnalyticsDatabase(config.get('MONGO_URL'))
    user_db.check_connection()
    logger.info("Connected to DB: {}".format(config.get('MONGO_URL')))
except Exception as err:
    # The DB is required: log the failure and abort startup entirely.
    logger.error("DB error: {0}".format(err))
    logger.exception(err)
    sys.exit()
# Step 2: takes untagged messages and splits them by sentence and then looks for location identifiers. put all statements w locations into new sheet. from cliff.api import Cliff import pandas as pd import geoip2.database import re reader = geoip2.database.Reader("../GeoLite2-City_20210202/GeoLite2-City.mmdb") my_cliff = Cliff('http://localhost:8080') file_name = "../processedData/messages.xlsx" # path to file + file name sheet = "Sheet1" # sheet name or sheet number or list of sheet numbers and names df = pd.read_excel(io=file_name, sheet_name=sheet) excel_data = [] check_repeat = [] for index, row in df.iterrows(): parsed_row = re.split('[?.:]', row['message']) for sentence in parsed_row: if (len(sentence.split()) < 4 and len(sentence.strip()) > 2): if (sentence.strip() not in check_repeat): temp_data = {} check_repeat.append(sentence.strip()) result = my_cliff.parse_text(sentence) try: targets = result['results']['places']['focus'] if targets != {}: # message, author temp_data['author'] = row['author'] temp_data['message'] = sentence.strip()
def extract_locaiton_info(text):
    """Print CLIFF's parse of *text*, then a sample geonames lookup (id 4943351).

    Note: the function name's spelling is preserved — callers depend on it.
    """
    client = Cliff(cliff_server_addr)
    parsed = client.parse_text(text)
    print(parsed)
    sample_place = client.geonames_lookup(4943351)
    print(sample_place)
def get_cliff_client():
    """Construct and return a Cliff API client for the configured CLIFF_URL."""
    client = Cliff(CLIFF_URL)
    return client
def setUp(self):
    """Create the Cliff client under test from the CLIFF_URL environment variable."""
    server_url = os.getenv("CLIFF_URL")
    self._url = server_url
    self._cliff = Cliff(server_url)
from cliff.api import Cliff
import json
from pprint import pprint

# Shared CLIFF client for this script (hard-coded lab server address).
my_cliff = Cliff("http://10.176.148.84:8080")


def extract_location(tweet):
    """Return {focus_level: [place names]} for the places CLIFF finds in *tweet*."""
    location = {}
    focus = my_cliff.parse_text(tweet)['results']['places']['focus']
    for key, value in focus.items():
        location[key] = [item['name'] for item in value]
    return location


def readFile(fileName):
    """Load and return the JSON contents of *fileName*."""
    with open(fileName, 'r') as f:
        d = json.load(f)
        f.close()  # redundant: the with-block already closes f
    return d


def writeFile(fileName, data):
    """Serialize *data* as JSON into *fileName*."""
    with open(fileName, 'w') as f:
        json.dump(data, f)
        f.close()  # redundant: the with-block already closes f


if __name__ == '__main__':
# NOTE(review): the main-guard body continues past the end of this fragment.
MAX_CHARS = 250 # limit the amount of text users can send in app = Flask(__name__) # setup logging logging.basicConfig(level=logging.WARN) log = logging.getLogger(__file__) log.info( "---------------------------------------------------------------------------" ) app_config = config.get_default_config() # set up the api client we will use CLIFF_URL = app_config.get('CLIFF_URL') cliff = Cliff(CLIFF_URL) cliff.PARSE_TEXT_PATH = "/cliff/parse/text" # instead of "/cliff-2.6.1/parse/text" # render the homepage @app.route("/") def index(): return render_template('home.html', version=VERSION) # return json results from CLIFF @app.route("/process", methods=['POST']) def geoparse(): text = request.form['text'] language = request.form['language'] demonyms = request.form['demonyms'] == 'true'
logging.basicConfig() # import required modules from cliff.api import Cliff import json import numpy as np import pandas as pd from pandas.io.json import json_normalize # read in abstracts scraped_abstracts = pd.read_csv( 'C:/Users/joeym/Documents/PhD/Aims/Aim 1 - collate pollinator knowledge/Outputs/scrape_abs/cleaned/for_geoparse/04_animal-species_abs_1-2-cleaned-for-geoparse.csv' ) # assign the localhost address to my_cliff my_cliff = Cliff('http://localhost:8999') # result object to append to result = [] # index for abstract object abstract = scraped_abstracts['abstract'] # index for title object EID = scraped_abstracts['EID'] # loop through abstracts for i in range(0, len(abstract)): try:
# Worker startup: configure file logging, read config, and connect to the
# MediaCloud and CLIFF services; Sentry reporting is optional.
logging.basicConfig(
    filename=os.path.join(base_dir, 'worker.log'),
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s')
logger = logging.getLogger(__name__)
logger.info(
    "------------------------------------------------------------------------")
logger.info("Starting up Geocoding Worker v{}".format(VERSION))

config = get_default_config()

BROKER_URL = config.get('BROKER_URL')
logger.info("BROKER_URL: {}".format(BROKER_URL))

MC_API_KEY = config.get('MC_API_KEY')
mc = mediacloud.api.AdminMediaCloud(MC_API_KEY)
# NOTE(review): this writes the raw API key into the log file — consider
# masking it if worker.log is not tightly access-controlled.
logger.info("MC_API_KEY: {}".format(MC_API_KEY))

CLIFF_URL = config.get('CLIFF_URL')
cliff = Cliff(CLIFF_URL)
logger.info("CLIFF_URL: {}".format(CLIFF_URL))

try:
    SENTRY_DSN = config.get('SENTRY_DSN')
    logger.info("SENTRY_DSN: {}".format(SENTRY_DSN))
    handler = SentryHandler(SENTRY_DSN)
    handler.setLevel(logging.WARN)
    setup_logging(handler)
except ConfigException:
    # No SENTRY_DSN in config — run without Sentry reporting.
    logger.info("No logging to sentry")