def main(): metautils.setsettings(settings) cur = metautils.getDBCursor(settings) #Get cities to search cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s', (True, )) bres = cur.fetchall() print 'domain:incommon:new:binglost' for row in bres: citydata = find_data(row[0], row[1]) citydict = {} for result in citydata: citydict[result['URL_Datei']] = result bingset = set(citydict.keys()) allset = set(citydict.keys()) cur = metautils.getDBCursor(settings) cur.execute('SELECT url FROM data WHERE source=%s AND city=%s', ('b', row[0])) dbset = set() for dbres in cur.fetchall(): dbset.add(dbres[0]) allset.add(dbres[0]) #Analysis intersection = dbset.intersection(bingset) dbnot = allset.difference(dbset) bingnot = allset.difference(bingset) records = [] for urlkey in dbnot: therow = citydict[urlkey] #In this case, we can safely assign it directly therow['URL'] = therow['URL_Datei'] #Likewise, there cannot be any filenames therow['filenames'] = [] metautils.convert_crawl_row(therow, 'b') records.append(therow) print row[1] + ':' + str(len(intersection)) + ':' + str( len(dbnot)) + ':' + str(len(bingnot)) #Write to DB metautils.addCrawlDataToDB( records) #Checked and accepted are both false by default
def main(): metautils.setsettings(settings) cur = metautils.getDBCursor(settings) #Get cities to search cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s', (True,)) bres = cur.fetchall() print 'domain:incommon:new:binglost' for row in bres: citydata = find_data(row[0], row[1]) citydict = {} for result in citydata: citydict[result['URL_Datei']] = result bingset = set(citydict.keys()) allset = set(citydict.keys()) cur = metautils.getDBCursor(settings) cur.execute('SELECT url FROM data WHERE source=%s AND city=%s', ('b', row[0])) dbset = set() for dbres in cur.fetchall(): dbset.add(dbres[0]) allset.add(dbres[0]) #Analysis intersection = dbset.intersection(bingset) dbnot = allset.difference(dbset) bingnot = allset.difference(bingset) records = [] for urlkey in dbnot: therow = citydict[urlkey] #In this case, we can safely assign it directly therow['URL'] = therow['URL_Datei'] #Likewise, there cannot be any filenames therow['filenames'] = [] metautils.convert_crawl_row(therow, 'b') records.append(therow) print row[1] + ':' + str(len(intersection)) + ':' + str(len(dbnot)) + ':' + str(len(bingnot)) #Write to DB metautils.addCrawlDataToDB(records) #Checked and accepted are both false by default
"""Scrapy settings for the open-data crawler.

Besides the static Scrapy configuration, this module loads the per-city
robots.txt-style blacklist from the database at import time.
"""
import metautils
from dbsettings import settings

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'

DOWNLOADER_MIDDLEWARES = {
    # Replace Scrapy's built-in robots.txt middleware with our own.
    'dirbot.middleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': None,
}
ROBOTSTXT_OBEY = True

# URL fragments that are never worth crawling, for any city.
GENERAL_BLACKLIST = ('.pdf', 'kontakt/', 'kontakt.', 'veranstaltungskalender',
                     'veranstaltungen', 'font=', 'print=', 'style=', 'font_size=')

#Old - not deleting as this needs to be transferred to the DB as far as possible
#ROBOTSTXT_BLACKLIST = ('urban.gera.de', 'extranet.iserlohn.de', 'timestamp=', 'textmodus=&textmodu', 'modul=druckansicht', 'switchtodate', 'dienstleistungen.php', 'siegen.de/vereinsregister', 'mitarbeiter/mitarbeiter.php', 'dienstleistungen/formular.php', 'events/list.php', 'lexikon/index.php', '.krebs/karte', 'cottbus.de/opt', 'cottbus.de/abfrage', 'sixcms/%20/sixcms', 'sixcms/sixcms', 'search', 'suche', 'dataset?f', 'branchen', 'mobil.koeln.de', 'flag_content', 'comment', 'immobilien.koeln', 'stadtfuehrungen', 'stadtplan.html', 'koeln.de/kleinanzeigen', '/feedback/' , '/recommend/', 'termine.koeln', 'bildung.koeln', 'anwendungen.bielefeld', 'wahlomat', 'php/merkliste', '_druck=1', 'unt_tagung', 'buergerinfo.ulm.de', 'map.jsp')

# BUG FIX: was ('/wahlen'), which is a parenthesized *string*, not a tuple —
# iterating or membership-testing it operates on single characters. The
# trailing comma makes it a one-element tuple like GENERAL_BLACKLIST above.
# NOTE(review): usage site is not visible here — confirm callers expect a tuple.
ROBOTSTXT_WHITELIST = ('/wahlen',)

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3.0
AUTOTHROTTLE_DEBUG = True

# Breadth-first-ish crawling with FIFO queues.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'

USER_AGENT = "Open_Data_Crawler/0.1 (+http://open-data-map.de)"

# Key is city url, value is array of forbidden URL parts.
ROBOTSTXT_BLACKLIST = dict()
cur = metautils.getDBCursor(settings)
cur.execute("SELECT url, crawl_blacklist FROM cities;")
results = cur.fetchall()
for result in results:
    ROBOTSTXT_BLACKLIST[result[0]] = result[1]
import psycopg2 import metautils from dbsettings import settings metautils.setsettings(settings) print '\nMarking all Bonn Google data as rejected (needs to be changed if Google searches are ever resumed!' cur = metautils.getDBCursor(settings, dictCursor=True) cur.execute('update data set accepted = %s where city = %s and source = %s', (False, 'bonn', 'g')) metautils.dbCommit() print '\nResetting open...' cur = metautils.getDBCursor(settings, dictCursor=True) cur.execute('select url, licenseshort from data') for ores in cur.fetchall(): if ores['licenseshort'].strip() == '': license = 'nicht bekannt' open = None else: open = metautils.isopen(ores['licenseshort'].strip()) license = ores['licenseshort'].strip() cur.execute('update data set licenseshort = %s, open = %s where url = %s', (license, open, ores['url'])) metautils.dbCommit() print 'Finding cities with data...' cities = metautils.getCitiesWithData() print cities
import psycopg2, unicodecsv import metautils from dbsettings import settings metautils.setsettings(settings) csvfile = open('export.csv', 'wb') csvwriter = unicodecsv.writer(csvfile) cur = metautils.getDBCursor(settings, dictCursor = True) cur.execute('select distinct unnest(categories) as cat from data') columns = ['city', 'source', 'formats', 'license', 'filelist', 'num. files'] categories = [] for catrow in cur.fetchall(): categories.append(catrow['cat']) columns.extend(categories) csvwriter.writerow(columns) cur.execute('select city, source, formats, licenseshort, filelist, categories from data where accepted = %s', (True,)) for res in cur.fetchall(): row = [res['city'], res['source'], metautils.arraytocsv(res['formats']), res['licenseshort'], metautils.arraytocsv(res['filelist']), str(len(res['filelist']))] for el in categories: if el in res['categories']: row.append('x') else: row.append('')