예제 #1
0
 def _basic_params(self):
     """Set up the basic parameters of the "extract" step.

     Reads ``self.extra_args``: the first entry is stored as
     ``self.pdftotext`` and ``self.force_update`` becomes True when more
     than one entry is present. If ``lib/tika-app-1.2.jar`` exists under
     ``self.cwd`` it is added through ``classPathHacker`` and a ``Tika``
     instance is stored in ``self.tika``; otherwise ``self.tika`` is left
     untouched.
     """
     self.name = "extract"
     args = self.extra_args
     self.pdftotext = args[0]
     self.force_update = len(args) > 1

     loader = classPathHacker()
     tika_jar = os.path.join(self.cwd, "lib", "tika-app-1.2.jar")
     if not os.path.exists(tika_jar):
         return

     # The Java import can only succeed once the jar has been added.
     loader.addFile(tika_jar)
     from org.apache.tika import Tika
     self.tika = Tika()
예제 #2
0
 def _basic_params(self):
     """Initialise the settings used by the 'extract' step.

     ``self.extra_args[0]`` is kept as ``self.pdftotext``; the presence of
     any further argument turns ``self.force_update`` on. When the bundled
     ``lib/tika-app-1.2.jar`` is found below ``self.cwd`` it is registered
     via ``classPathHacker`` and ``self.tika`` is set to a new ``Tika``
     object. Nothing is assigned to ``self.tika`` when the jar is absent.
     """
     self.name = 'extract'
     self.pdftotext = self.extra_args[0]
     extra_count = len(self.extra_args)
     self.force_update = True if extra_count > 1 else False

     class_path = classPathHacker()
     jar_location = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
     jar_present = os.path.exists(jar_location)
     if jar_present:
         class_path.addFile(jar_location)
         # Imported here because the class lives in the jar added above.
         from org.apache.tika import Tika
         self.tika = Tika()
예제 #3
0
    def _basic_params(self):
        """Prepare the 'extract' step: validate pdftotext and load Tika.

        The first extra argument is stored as ``self.pdftotext``; if that
        path does not exist an error is logged and the process exits with
        status 1. ``self.force_update`` is True when more than one extra
        argument was given. If ``lib/tika-app-1.2.jar`` exists under
        ``self.cwd`` it is added via ``classPathHacker`` and a ``Tika``
        instance is stored in ``self.tika``.
        """
        self.name = 'extract'

        pdftotext_path = self.extra_args[0]
        self.pdftotext = pdftotext_path
        if not os.path.exists(pdftotext_path):
            logging.error('pdftotext not found!')
            sys.exit(1)

        self.force_update = len(self.extra_args) > 1

        loader = classPathHacker()
        tika_jar = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
        if not os.path.exists(tika_jar):
            return

        # The Java import can only succeed once the jar has been added.
        loader.addFile(tika_jar)
        from org.apache.tika import Tika
        self.tika = Tika()
예제 #4
0
	def run_geoparser(self):
		"""Geoparse each file in ``self.files`` and aggregate the places found.

		For every file without a usable ``*_geoparse.json`` result, the text
		(prefixed with the item's ``place`` metadata) is run through
		``GeodictParser``; the raw matches are stored in ``*_geodict.json``
		and reused on later runs. Matches that start inside the ``place``
		prefix set the item's city; all other matches are recorded as places
		with reference offsets relative to the start of the file's text. If no
		city was matched and ``place`` is non-empty, the GeoNames search
		service is queried and the answer (including "no result") is stored in
		``geoparser.cache`` under ``self.out_dir``.

		Side effects: sets ``self.database_path``, ``self.cache_filename``,
		``self.cache``, ``self.geo_parsed`` (filename -> list of entity URIs),
		``self.places`` (entity URI -> name/type/coordinates plus a ``weight``
		dict counting files per year) and ``self.places_by_entityURI``; stores
		the city in ``self.metadata[filename]['city']``; writes the JSON files
		named above.

		Errors raised while handling a single file are logged instead of
		propagated; ``KeyboardInterrupt`` and ``SystemExit`` always propagate.
		"""
		# The JDBC driver is put on the classpath before the geodict modules
		# are imported (presumably they need it for database access).
		jar_loader = classPathHacker()
		sqlite_jar = os.path.join(self.cwd, "lib", "geodict", "sqlite-jdbc-3.7.2.jar")
		jar_loader.addFile(sqlite_jar)

		# Not referenced by name below; presumably imported for its
		# import-time effects, so it is kept.
		import lib.geodict.geodict_config

		self.database_path = os.path.join(self.cwd, "lib", "geodict", "geodict.db")

		from lib.geodict.geodict_lib import GeodictParser

		geo_parsed = {}
		places_by_entityURI = {}

		self.cache_filename = os.path.join(self.out_dir, "geoparser.cache")
		if os.path.exists(self.cache_filename):
			with open(self.cache_filename) as cache_file:
				self.cache = json.load(cache_file)
		else:
			self.cache = {}

		for filename in self.files:
			logging.info("processing " + filename)
			self.update_progress()

			file_geoparsed = filename.replace(".txt", "_geoparse.json")
			contexts_json = filename.replace(".txt", "_contexts.json")

			# Start from an empty result so that a dry run, or an unreadable
			# result file, never reuses the previous file's data and never
			# hits an unbound name in the aggregation step below.
			geoparse_obj = {}

			if os.path.exists(file_geoparsed):
				try:
					# Closed before the os.remove() below can run.
					with open(file_geoparsed) as geoparsed_file:
						geoparse_obj = json.load(geoparsed_file)
					if "places_by_entityURI" in geoparse_obj:
						if not os.path.exists(contexts_json):
							self.contexts_from_geoparse_obj(geoparse_obj, filename)
						# NOTE(review): files with an existing valid result skip
						# the aggregation at the end of this loop body, so they
						# never reach geo_parsed / self.places -- confirm that
						# this is intended.
						continue
					else:
						# Result lacks the expected key: discard and rebuild.
						os.remove(file_geoparsed)
				except (KeyboardInterrupt, SystemExit):
					raise
				except:
					logging.error("File " + file_geoparsed + " could not be read.")
					logging.error(traceback.format_exc())

			if not self.dry_run:
				geoparse_obj = {'places_by_entityURI': {}, 'references': {}}
				try:
					item_id = self.metadata[filename]['itemID']
					place_name = self.metadata[filename]['place']
					# Matches starting before this offset lie in the place prefix.
					last_index = len(place_name)
					with codecs.open(filename, 'rU', encoding='utf8') as text_file:
						str_to_parse = place_name + text_file.read()

					city = None
					places = set()

					json_filename = filename.replace('.txt', '_geodict.json')

					if not os.path.exists(json_filename):
						parser = GeodictParser(self.database_path)
						places_found = list(self.get_places(str_to_parse, parser.find_locations_in_text))
						with codecs.open(json_filename, 'w', encoding='utf8') as json_file:
							json.dump(places_found, json_file)
					else:
						with codecs.open(json_filename, 'r', encoding='utf8') as json_file:
							places_found = json.load(json_file)

					place_dict = geoparse_obj['places_by_entityURI']
					references = geoparse_obj['references']
					for (place, reference) in places_found:
						entityURI = place["entityURI"]
						place_dict[entityURI] = {'name': place["name"], 'type': place["type"], 'coordinates': [place["longitude"], place["latitude"]]}

						if reference[0] < last_index:
							city = entityURI
						else:
							places.add(entityURI)
							# Offsets are stored relative to the file's own text.
							references.setdefault(entityURI, []).append((reference[0] - last_index, reference[1] - last_index))

					if city is None and place_name != "":
						try:
							query_str = place_name
							was_cached = query_str in self.cache
							if was_cached:
								result = self.cache[query_str]
							else:
								search_for = {"q": query_str}
								query_url = "http://ws.geonames.org/searchJSON?%s" % urllib.urlencode(search_for)
								response = urllib2.urlopen(query_url)
								try:
									result_obj = json.load(response)
								finally:
									response.close()
								result_places = result_obj.get("geonames", [])
								if len(result_places) > 0:
									result = result_places[0]
									result["entityURI"] = "http://sws.geonames.org/" + str(result.get("geonameId"))
								else:
									# Misses are cached too (as None).
									result = None
								self.cache[query_str] = result

							if result is not None:
								uri = result["entityURI"]
								place_dict[uri] = {'name': result["name"], 'type': result["fcodeName"], 'coordinates': [result["lng"], result["lat"]]}
								places.add(uri)
								city = uri

							if not was_cached:
								with open(self.cache_filename, 'w') as cache_file:
									json.dump(self.cache, cache_file)
						except (KeyboardInterrupt, SystemExit):
							raise
						except:
							logging.error("No city found for %s" % item_id)
							logging.error(traceback.format_exc())

					geoparse_obj['places'] = list(places)
					geoparse_obj['city'] = city
					with open(file_geoparsed, 'w') as f:
						json.dump(geoparse_obj, f)
					if not os.path.exists(contexts_json):
						self.contexts_from_geoparse_obj(geoparse_obj, filename)
					# Short pause between files (presumably to throttle lookups).
					time.sleep(0.2)
				except (KeyboardInterrupt, SystemExit):
					raise
				except:
					logging.error(traceback.format_exc())

			geo_parsed[filename] = geoparse_obj.get('places', [])
			self.metadata[filename]['city'] = geoparse_obj.get('city')
			places_by_entityURI.update(geoparse_obj.get('places_by_entityURI', {}))

		places = {}
		for filename, entityURIs in geo_parsed.iteritems():
			year = self.metadata[filename]["year"]
			for entityURI in entityURIs:
				if entityURI not in places_by_entityURI:
					continue
				if entityURI not in places:
					entity = places_by_entityURI[entityURI]
					places[entityURI] = {
						"name": entity["name"],
						"type": entity["type"],
						"coordinates": entity["coordinates"],
						"weight": {year: 1},
					}
				else:
					weight = places[entityURI]["weight"]
					weight[year] = weight.get(year, 0) + 1
		self.geo_parsed = geo_parsed
		self.places = places
		self.places_by_entityURI = places_by_entityURI
예제 #5
0
    def run_geoparser(self):
        """Geoparse each file in ``self.files`` and aggregate the places found.

        For every file without a usable ``*_geoparse.json`` result, the text
        (prefixed with the item's ``place`` metadata) is run through
        ``GeodictParser``; the raw matches are stored in ``*_geodict.json``
        and reused on later runs. Matches that start inside the ``place``
        prefix set the item's city; all other matches are recorded as places
        with reference offsets relative to the start of the file's text. If
        no city was matched and ``place`` is non-empty, the GeoNames search
        service is queried and the answer (including "no result") is stored
        in ``geoparser.cache`` under ``self.out_dir``.

        Side effects: sets ``self.database_path``, ``self.cache_filename``,
        ``self.cache``, ``self.geo_parsed`` (filename -> list of entity
        URIs), ``self.places`` (entity URI -> name/type/coordinates plus a
        ``weight`` dict counting files per year) and
        ``self.places_by_entityURI``; stores the city in
        ``self.metadata[filename]['city']``; writes the JSON files named
        above.

        Errors raised while handling a single file are logged instead of
        propagated; ``KeyboardInterrupt`` and ``SystemExit`` always
        propagate.
        """
        # The JDBC driver is put on the classpath before the geodict modules
        # are imported (presumably they need it for database access).
        jar_loader = classPathHacker()
        sqlite_jar = os.path.join(self.cwd, 'lib', 'geodict',
                                  'sqlite-jdbc-3.7.2.jar')
        jar_loader.addFile(sqlite_jar)

        # Not referenced by name below; presumably imported for its
        # import-time effects, so it is kept.
        import lib.geodict.geodict_config

        self.database_path = os.path.join(self.cwd, 'lib', 'geodict',
                                          'geodict.db')

        from lib.geodict.geodict_lib import GeodictParser

        geo_parsed = {}
        places_by_entityURI = {}

        self.cache_filename = os.path.join(self.out_dir, 'geoparser.cache')
        if os.path.exists(self.cache_filename):
            with open(self.cache_filename) as cache_file:
                self.cache = json.load(cache_file)
        else:
            self.cache = {}

        for filename in self.files:
            logging.info('processing ' + filename)
            self.update_progress()

            file_geoparsed = filename.replace('.txt', '_geoparse.json')
            contexts_json = filename.replace('.txt', '_contexts.json')

            # Start from an empty result so that a dry run, or an unreadable
            # result file, never reuses the previous file's data and never
            # hits an unbound name in the aggregation step below.
            geoparse_obj = {}

            if os.path.exists(file_geoparsed):
                try:
                    # Closed before the os.remove() below can run.
                    with open(file_geoparsed) as geoparsed_file:
                        geoparse_obj = json.load(geoparsed_file)
                    if 'places_by_entityURI' in geoparse_obj:
                        if not os.path.exists(contexts_json):
                            self.contexts_from_geoparse_obj(
                                geoparse_obj, filename)
                        # NOTE(review): files with an existing valid result
                        # skip the aggregation at the end of this loop body,
                        # so they never reach geo_parsed / self.places --
                        # confirm that this is intended.
                        continue
                    else:
                        # Result lacks the expected key: discard and rebuild.
                        os.remove(file_geoparsed)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.error('File ' + file_geoparsed +
                                  ' could not be read.')
                    logging.error(traceback.format_exc())

            if not self.dry_run:
                geoparse_obj = {'places_by_entityURI': {}, 'references': {}}
                try:
                    itemid = self.metadata[filename]['itemID']
                    place_name = self.metadata[filename]['place']
                    # Matches starting before this offset lie in the place
                    # prefix.
                    last_index = len(place_name)
                    with codecs.open(filename, 'rU',
                                     encoding='utf8') as text_file:
                        str_to_parse = place_name + text_file.read()

                    city = None
                    places = set()

                    json_filename = filename.replace('.txt', '_geodict.json')

                    if not os.path.exists(json_filename):
                        parser = GeodictParser(self.database_path)
                        places_found = list(self.get_places(
                            str_to_parse, parser.find_locations_in_text))

                        with codecs.open(json_filename, 'w',
                                         encoding='utf8') as json_file:
                            json.dump(places_found, json_file)
                    else:
                        with codecs.open(json_filename, 'r',
                                         encoding='utf8') as json_file:
                            places_found = json.load(json_file)

                    place_dict = geoparse_obj['places_by_entityURI']
                    references = geoparse_obj['references']
                    for (place, reference) in places_found:
                        entityURI = place['entityURI']
                        place_dict[entityURI] = {
                            'name': place['name'],
                            'type': place['type'],
                            'coordinates':
                            [place['longitude'], place['latitude']]
                        }

                        if reference[0] < last_index:
                            city = entityURI
                        else:
                            places.add(entityURI)
                            # Offsets are stored relative to the file's own
                            # text.
                            references.setdefault(entityURI, []).append(
                                (reference[0] - last_index,
                                 reference[1] - last_index))

                    if city is None and place_name != '':
                        try:
                            query_str = place_name
                            was_cached = query_str in self.cache
                            if was_cached:
                                result = self.cache[query_str]
                            else:
                                search_for = {'q': query_str}
                                query_url = ('http://ws.geonames.org/'
                                             'searchJSON?%s'
                                             % urllib.urlencode(search_for))
                                response = urllib2.urlopen(query_url)
                                try:
                                    result_obj = json.load(response)
                                finally:
                                    response.close()
                                result_places = \
                                    result_obj.get('geonames', [])
                                if len(result_places) > 0:
                                    result = result_places[0]
                                    result['entityURI'] = (
                                        'http://sws.geonames.org/'
                                        + str(result.get('geonameId')))
                                else:
                                    # Misses are cached too (as None).
                                    result = None
                                self.cache[query_str] = result

                            if result is not None:
                                uri = result['entityURI']
                                place_dict[uri] = {
                                    'name': result['name'],
                                    'type': result['fcodeName'],
                                    'coordinates':
                                    [result['lng'], result['lat']]
                                }
                                places.add(uri)
                                city = uri

                            if not was_cached:
                                with open(self.cache_filename,
                                          'w') as cache_f:
                                    json.dump(self.cache, cache_f)
                        except (KeyboardInterrupt, SystemExit):
                            raise
                        except:
                            logging.error('No city found for %s' % itemid)
                            logging.error(traceback.format_exc())

                    geoparse_obj['places'] = list(places)
                    geoparse_obj['city'] = city
                    with open(file_geoparsed, 'w') as f:
                        json.dump(geoparse_obj, f)
                    if not os.path.exists(contexts_json):
                        self.contexts_from_geoparse_obj(geoparse_obj, filename)
                    # Short pause between files (presumably to throttle
                    # lookups).
                    time.sleep(0.2)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.error(traceback.format_exc())

            geo_parsed[filename] = geoparse_obj.get('places', [])
            self.metadata[filename]['city'] = geoparse_obj.get('city')
            places_by_entityURI.update(
                geoparse_obj.get('places_by_entityURI', {}))

        places = {}
        for (filename, entityURIs) in geo_parsed.iteritems():
            year = self.metadata[filename]['year']
            for entityURI in entityURIs:
                if entityURI not in places_by_entityURI:
                    continue
                if entityURI not in places:
                    entity = places_by_entityURI[entityURI]
                    places[entityURI] = {
                        'name': entity['name'],
                        'type': entity['type'],
                        'coordinates': entity['coordinates'],
                        'weight': {year: 1},
                    }
                else:
                    weight = places[entityURI]['weight']
                    weight[year] = weight.get(year, 0) + 1
        self.geo_parsed = geo_parsed
        self.places = places
        self.places_by_entityURI = places_by_entityURI
예제 #6
0
    def run_geoparser(self):
        """Run the geodict parser over ``self.files`` and collect place data.

        Per file: if a ``*_geoparse.json`` result containing
        ``places_by_entityURI`` already exists, only the contexts file is
        (re)created when missing. Otherwise, unless ``self.dry_run`` is set,
        the item's ``place`` metadata followed by the file's text is parsed
        with ``GeodictParser`` (raw matches are kept in ``*_geodict.json``
        and reused when present). A match beginning inside the ``place``
        prefix becomes the city; later matches become places whose
        reference offsets are relative to the file's text. When no city was
        matched and ``place`` is non-empty, GeoNames is searched and the
        answer, or ``None`` for no result, is saved in ``geoparser.cache``.

        Side effects: sets ``self.database_path``, ``self.cache_filename``,
        ``self.cache``, ``self.geo_parsed``, ``self.places`` and
        ``self.places_by_entityURI``; writes
        ``self.metadata[filename]['city']``; writes the JSON files named
        above. In ``self.places`` each entity carries a ``weight`` dict that
        counts, per year, the files in which it was found.

        Per-file errors are logged with their traceback and do not stop the
        run; ``KeyboardInterrupt`` and ``SystemExit`` are always re-raised.
        """
        # The JDBC driver is put on the classpath before the geodict modules
        # are imported (presumably they need it for database access).
        class_path = classPathHacker()
        class_path.addFile(os.path.join(self.cwd, 'lib', 'geodict',
                                        'sqlite-jdbc-3.7.2.jar'))

        # Not referenced by name below; presumably imported for its
        # import-time effects, so it is kept.
        import lib.geodict.geodict_config

        self.database_path = os.path.join(self.cwd, 'lib', 'geodict',
                                          'geodict.db')

        from lib.geodict.geodict_lib import GeodictParser

        geo_parsed = {}
        places_by_entityURI = {}

        self.cache_filename = os.path.join(self.out_dir,
                                           'geoparser.cache')
        if os.path.exists(self.cache_filename):
            with open(self.cache_filename) as cache_in:
                self.cache = json.load(cache_in)
        else:
            self.cache = {}

        for filename in self.files:
            logging.info('processing ' + filename)
            self.update_progress()

            file_geoparsed = filename.replace('.txt', '_geoparse.json')
            contexts_json = filename.replace('.txt', '_contexts.json')

            # Reset per file: without this a dry run (or an unreadable
            # result file) would aggregate the previous file's data, or fail
            # on an unbound name for the very first file.
            geoparse_obj = {}

            if os.path.exists(file_geoparsed):
                try:
                    # The handle is closed before os.remove() below can run.
                    with open(file_geoparsed) as result_in:
                        geoparse_obj = json.load(result_in)
                    if 'places_by_entityURI' in geoparse_obj:
                        if not os.path.exists(contexts_json):
                            self.contexts_from_geoparse_obj(geoparse_obj,
                                                            filename)
                        # NOTE(review): files with an existing valid result
                        # skip the aggregation at the end of this loop body,
                        # so they never reach geo_parsed / self.places --
                        # confirm that this is intended.
                        continue
                    else:
                        # Result lacks the expected key: discard and rebuild.
                        os.remove(file_geoparsed)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.error('File ' + file_geoparsed
                                  + ' could not be read.')
                    logging.error(traceback.format_exc())

            if not self.dry_run:
                geoparse_obj = {'places_by_entityURI': {},
                                'references': {}}
                try:
                    itemid = self.metadata[filename]['itemID']
                    place_name = self.metadata[filename]['place']
                    # Matches starting before this offset lie in the place
                    # prefix.
                    last_index = len(place_name)
                    with codecs.open(filename, 'rU',
                                     encoding='utf8') as text_in:
                        str_to_parse = place_name + text_in.read()

                    city = None
                    places = set()

                    json_filename = filename.replace('.txt',
                                                     '_geodict.json')

                    if not os.path.exists(json_filename):
                        parser = GeodictParser(self.database_path)
                        places_found = list(self.get_places(
                            str_to_parse, parser.find_locations_in_text))

                        with codecs.open(json_filename, 'w',
                                         encoding='utf8') as json_file:
                            json.dump(places_found, json_file)
                    else:
                        with codecs.open(json_filename, 'r',
                                         encoding='utf8') as json_file:
                            places_found = json.load(json_file)

                    place_dict = geoparse_obj['places_by_entityURI']
                    references = geoparse_obj['references']
                    for (place, reference) in places_found:
                        entityURI = place['entityURI']
                        place_dict[entityURI] = {
                            'name': place['name'],
                            'type': place['type'],
                            'coordinates': [place['longitude'],
                                            place['latitude']],
                        }

                        if reference[0] < last_index:
                            city = entityURI
                        else:
                            places.add(entityURI)
                            # Offsets are stored relative to the file's own
                            # text.
                            references.setdefault(entityURI, []).append(
                                (reference[0] - last_index,
                                 reference[1] - last_index))

                    if city is None and place_name != '':
                        try:
                            query_str = place_name
                            was_cached = query_str in self.cache
                            if was_cached:
                                result = self.cache[query_str]
                            else:
                                search_for = {'q': query_str}
                                query_url = ('http://ws.geonames.org/'
                                             'searchJSON?%s'
                                             % urllib.urlencode(search_for))
                                response = urllib2.urlopen(query_url)
                                try:
                                    result_obj = json.load(response)
                                finally:
                                    response.close()
                                result_places = \
                                    result_obj.get('geonames', [])
                                if len(result_places) > 0:
                                    result = result_places[0]
                                    result['entityURI'] = (
                                        'http://sws.geonames.org/'
                                        + str(result.get('geonameId')))
                                else:
                                    # Misses are cached too (as None).
                                    result = None
                                self.cache[query_str] = result

                            if result is not None:
                                uri = result['entityURI']
                                place_dict[uri] = {
                                    'name': result['name'],
                                    'type': result['fcodeName'],
                                    'coordinates': [result['lng'],
                                                    result['lat']],
                                }
                                places.add(uri)
                                city = uri

                            if not was_cached:
                                with open(self.cache_filename,
                                          'w') as cache_f:
                                    json.dump(self.cache, cache_f)
                        except (KeyboardInterrupt, SystemExit):
                            raise
                        except:
                            logging.error('No city found for %s'
                                          % itemid)
                            logging.error(traceback.format_exc())

                    geoparse_obj['places'] = list(places)
                    geoparse_obj['city'] = city
                    with open(file_geoparsed, 'w') as f:
                        json.dump(geoparse_obj, f)
                    if not os.path.exists(contexts_json):
                        self.contexts_from_geoparse_obj(geoparse_obj,
                                                        filename)
                    # Short pause between files (presumably to throttle
                    # lookups).
                    time.sleep(0.2)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except:
                    logging.error(traceback.format_exc())

            geo_parsed[filename] = geoparse_obj.get('places', [])
            self.metadata[filename]['city'] = geoparse_obj.get('city')
            places_by_entityURI.update(
                geoparse_obj.get('places_by_entityURI', {}))

        places = {}
        for (filename, entityURIs) in geo_parsed.iteritems():
            year = self.metadata[filename]['year']
            for entityURI in entityURIs:
                if entityURI not in places_by_entityURI:
                    continue
                if entityURI not in places:
                    entity = places_by_entityURI[entityURI]
                    places[entityURI] = {
                        'name': entity['name'],
                        'type': entity['type'],
                        'coordinates': entity['coordinates'],
                        'weight': {year: 1},
                    }
                else:
                    weight = places[entityURI]['weight']
                    weight[year] = weight.get(year, 0) + 1
        self.geo_parsed = geo_parsed
        self.places = places
        self.places_by_entityURI = places_by_entityURI