def test_find_wikipedia_url(self):
    """find_wikipedia_url resolves an entity's alias to a wikipedia URL
    (first element of the returned pair) and returns ['', ''] when no
    acceptable page is found.
    """
    # Politician whose alias is an exact wikipedia page title.
    e = EntityPlus.objects.create(type='politician', name="foo")
    e.aliases.create(alias='Barack Obama')
    e.politician_metadata_for_latest_cycle = PoliticianMetadataLatest(
        state='', party='D', seat='federal:president', cycle=2010,
        seat_status='open', seat_result='win')
    self.assertEqual(find_wikipedia_url(e)[0],
                     'http://en.wikipedia.org/wiki/Barack_Obama')

    # Organization alias resolved to a different page title
    # (Atlantic Richfield -> ARCO).
    e = EntityPlus.objects.create(type='organization', name="foo1")
    e.aliases.create(alias='Atlantic Richfield')
    self.assertEqual(find_wikipedia_url(e)[0],
                     'http://en.wikipedia.org/wiki/ARCO')

    # No wikipedia entry at all -> empty pair.
    e = EntityPlus.objects.create(type='organization', name="foo2")
    e.aliases.create(alias='No WP entry for this')
    self.assertEqual(find_wikipedia_url(e), ['', ''])

    # Digit-prefixed aliases also yield no match — presumably guarding
    # against spurious numeric-title hits; TODO confirm intent.
    e = EntityPlus.objects.create(type='organization', name="foo4")
    e.aliases.create(alias='159 Group')
    self.assertEqual(find_wikipedia_url(e), ['', ''])

    e = EntityPlus.objects.create(type='organization', name="foo5")
    e.aliases.create(alias='188 Claremont')
    self.assertEqual(find_wikipedia_url(e), ['', ''])
def handle(self, *args, **kwargs): if len(args) == 0: print \ """Requires at least one CSV file argument, which should contain at least two columns: <entity id>, <wikipedia url or ''> """ return self.coded = {} for csv_file in args: with open(csv_file) as fh: reader = csv.reader(fh) for row in reader: self.coded[row[0]] = row[1] self.results = {} self.entities = {} for eid, url in self.coded.iteritems(): # Annotate the entity with its total from the agg_entities table. entity = EntityPlus.objects.raw(""" SELECT m.*, GREATEST(a.contributor_amount, a.recipient_amount) AS total FROM matchbox_entity m LEFT JOIN agg_entities a ON m.id=a.entity_id WHERE a.cycle=-1 AND m.id = %s """, [eid])[0] self.results[eid] = find_wikipedia_url(entity)[0] self.entities[eid] = entity self.pretty_print_results()
def handle(self, *args, **kwargs): if len(args) == 0: print \ """Requires at least one CSV file argument, which should contain at least two columns: <entity id>, <wikipedia url or ''> """ return self.coded = {} for csv_file in args: with open(csv_file) as fh: reader = csv.reader(fh) for row in reader: self.coded[row[0]] = row[1] self.results = {} self.entities = {} for eid, url in self.coded.iteritems(): # Annotate the entity with its total from the agg_entities table. entity = EntityPlus.objects.raw( """ SELECT m.*, GREATEST(a.contributor_amount, a.recipient_amount) AS total FROM matchbox_entity m LEFT JOIN agg_entities a ON m.id=a.entity_id WHERE a.cycle=-1 AND m.id = %s """, [eid])[0] self.results[eid] = find_wikipedia_url(entity)[0] self.entities[eid] = entity self.pretty_print_results()
def test_find_wikipedia_url(self):
    """Exercise find_wikipedia_url: a matching alias yields the page URL
    as the first element; unmatched aliases yield ['', ''].
    """
    def org_with_alias(name, alias):
        # Helper: create an organization entity carrying a single alias.
        org = EntityPlus.objects.create(type='organization', name=name)
        org.aliases.create(alias=alias)
        return org

    # Politician with an exact-title wikipedia page.
    e = EntityPlus.objects.create(type='politician', name="foo")
    e.aliases.create(alias='Barack Obama')
    e.politician_metadata_for_latest_cycle = PoliticianMetadataLatest(
        state='', party='D', seat='federal:president', cycle=2010,
        seat_status='open', seat_result='win')
    self.assertEqual(find_wikipedia_url(e)[0],
                     'http://en.wikipedia.org/wiki/Barack_Obama')

    # Alias mapped to a differently-titled page.
    self.assertEqual(
        find_wikipedia_url(org_with_alias("foo1", 'Atlantic Richfield'))[0],
        'http://en.wikipedia.org/wiki/ARCO')

    # No wikipedia entry -> empty result pair.
    self.assertEqual(
        find_wikipedia_url(org_with_alias("foo2", 'No WP entry for this')),
        ['', ''])

    # Digit-prefixed aliases produce no match — presumably rejecting
    # spurious numeric-title hits; TODO confirm intent.
    self.assertEqual(
        find_wikipedia_url(org_with_alias("foo4", '159 Group')), ['', ''])
    self.assertEqual(
        find_wikipedia_url(org_with_alias("foo5", '188 Claremont')), ['', ''])
def handle(self, *args, **kwargs): if len(args) < 2: print "Requires two arguments: <error_file> <output_file>" print print self.help return error_file = args[0] output_file = args[1] # Retrieve previous results results = {} if os.path.exists(output_file): with open(output_file) as fh: reader = utils.UnicodeReader(fh) for row in reader: results[row[0]] = row # Fetch and record new results with open(output_file, 'w') as fh: writer = utils.UnicodeWriter(fh) for eid, row in results.iteritems(): writer.writerow(row) eids = [] with open(error_file) as fh: error_reader = utils.UnicodeReader(fh) for row in error_reader: eids.append(row[0]) for eid in eids: if eid in results: continue entity = EntityPlus.objects.get(id=eid) print print "Attempting:", eid, entity.name # Allow errors to be thrown. wp = find_wikipedia_url(entity) or ['', '', ''] url, excerpt, image_url = wp row = (entity.id, url, image_url, excerpt, unicode(datetime.datetime.now())) results[row[0]] = row writer.writerow(row) print " %i/%i" % (len(results), len(eids)), print(entity.name, row[1])
def handle(self, *args, **kwargs): if len(args) < 2: print "Requires two arguments: <error_file> <output_file>" print print self.help return error_file = args[0] output_file = args[1] # Retrieve previous results results = {} if os.path.exists(output_file): with open(output_file) as fh: reader = utils.UnicodeReader(fh) for row in reader: results[row[0]] = row # Fetch and record new results with open(output_file, "w") as fh: writer = utils.UnicodeWriter(fh) for eid, row in results.iteritems(): writer.writerow(row) eids = [] with open(error_file) as fh: error_reader = utils.UnicodeReader(fh) for row in error_reader: eids.append(row[0]) for eid in eids: if eid in results: continue entity = EntityPlus.objects.get(id=eid) print print "Attempting:", eid, entity.name # Allow errors to be thrown. wp = find_wikipedia_url(entity) or ["", "", ""] url, excerpt, image_url = wp row = (entity.id, url, image_url, excerpt, unicode(datetime.datetime.now())) results[row[0]] = row writer.writerow(row) print " %i/%i" % (len(results), len(eids)), print (entity.name, row[1])