Example #1
import metautils

from dbsettings import settings

#find_data is assumed to be defined elsewhere in the project (the Bing search
#helper); it is not part of this snippet.


def main():
    metautils.setsettings(settings)

    cur = metautils.getDBCursor(settings)

    #Get cities to search
    cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s',
                (True, ))
    bres = cur.fetchall()

    print 'domain:incommon:new:binglost'

    for row in bres:
        citydata = find_data(row[0], row[1])
        citydict = {}
        for result in citydata:
            citydict[result['URL_Datei']] = result
        bingset = set(citydict.keys())
        allset = set(citydict.keys())
        cur = metautils.getDBCursor(settings)
        cur.execute('SELECT url FROM data WHERE source=%s AND city=%s',
                    ('b', row[0]))
        dbset = set()
        for dbres in cur.fetchall():
            dbset.add(dbres[0])
            allset.add(dbres[0])
        #Analysis: URLs that both Bing and the DB know, URLs new to the DB,
        #and URLs the DB has but Bing no longer returns
        intersection = dbset.intersection(bingset)
        dbnot = allset.difference(dbset)  #new
        bingnot = allset.difference(bingset)  #binglost

        records = []
        for urlkey in dbnot:
            therow = citydict[urlkey]
            #In this case, we can safely assign it directly
            therow['URL'] = therow['URL_Datei']
            #Likewise, there cannot be any filenames
            therow['filenames'] = []
            metautils.convert_crawl_row(therow, 'b')
            records.append(therow)

        print '%s:%d:%d:%d' % (row[1], len(intersection), len(dbnot), len(bingnot))
        #Write to DB; checked and accepted are both false by default
        metautils.addCrawlDataToDB(records)
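
The bookkeeping above boils down to three set operations. A minimal sketch with made-up URLs (the names are illustrative only):

bingset = set(['a.csv', 'b.csv'])  #found by Bing this run
dbset = set(['b.csv', 'c.csv'])    #already in the DB
allset = bingset.union(dbset)

print dbset.intersection(bingset)  #in common: set(['b.csv'])
print allset.difference(dbset)     #new:       set(['a.csv'])
print allset.difference(bingset)   #binglost:  set(['c.csv'])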
Example #3
import metautils

from dbsettings import settings

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
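#Swap out Scrapy's stock robots.txt middleware (set to None) for the
#project's own version, registered at order 100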
DOWNLOADER_MIDDLEWARES = {
    'dirbot.middleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': None,
}
ROBOTSTXT_OBEY = True
GENERAL_BLACKLIST = ('.pdf', 'kontakt/', 'kontakt.', 'veranstaltungskalender',
                     'veranstaltungen', 'font=', 'print=', 'style=', 'font_size=')
#Old - not deleting as this needs to be transferred to the DB as far as possible
#ROBOTSTXT_BLACKLIST = ('urban.gera.de', 'extranet.iserlohn.de', 'timestamp=', 'textmodus=&textmodu', 'modul=druckansicht', 'switchtodate', 'dienstleistungen.php', 'siegen.de/vereinsregister', 'mitarbeiter/mitarbeiter.php', 'dienstleistungen/formular.php', 'events/list.php', 'lexikon/index.php', '.krebs/karte', 'cottbus.de/opt', 'cottbus.de/abfrage', 'sixcms/%20/sixcms', 'sixcms/sixcms', 'search', 'suche', 'dataset?f', 'branchen', 'mobil.koeln.de', 'flag_content', 'comment', 'immobilien.koeln', 'stadtfuehrungen', 'stadtplan.html', 'koeln.de/kleinanzeigen', '/feedback/' , '/recommend/', 'termine.koeln', 'bildung.koeln', 'anwendungen.bielefeld', 'wahlomat', 'php/merkliste', '_druck=1', 'unt_tagung', 'buergerinfo.ulm.de', 'map.jsp')
ROBOTSTXT_WHITELIST = ('/wahlen', )  #Trailing comma: a one-element tuple, not a plain string
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3.0
AUTOTHROTTLE_DEBUG = True
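#DEPTH_PRIORITY = 1 together with the FIFO queues below switches Scrapy from
#its default depth-first crawl order to breadth-first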
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
USER_AGENT = "Open_Data_Crawler/0.1 (+http://open-data-map.de)"

ROBOTSTXT_BLACKLIST = dict()

cur = metautils.getDBCursor(settings)
cur.execute("SELECT url, crawl_blacklist FROM cities;")
results = cur.fetchall()
for result in results:
    ROBOTSTXT_BLACKLIST[result[0]] = result[1] #Key is city url, value is array of forbidden URL parts
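
The custom dirbot.middleware.robotstxt.RobotsTxtMiddleware referenced above is not shown in this snippet. A minimal sketch of how a downloader middleware could apply these settings (the class and its filtering logic are assumptions; only the process_request hook is Scrapy's real interface, and a real middleware would read the settings from crawler.settings rather than module globals):

from scrapy.exceptions import IgnoreRequest

class CityBlacklistMiddleware(object):
    #Hypothetical sketch, not the real dirbot middleware: drop any request
    #whose URL contains a fragment blacklisted for the matching city,
    #unless a whitelist entry also matches
    def process_request(self, request, spider):
        url = request.url
        if any(w in url for w in ROBOTSTXT_WHITELIST):
            return None  #whitelisted URLs always pass
        for cityurl, parts in ROBOTSTXT_BLACKLIST.iteritems():
            if cityurl in url and any(part in url for part in (parts or [])):
                raise IgnoreRequest()
        return None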
Example #4
import psycopg2

import metautils

from dbsettings import settings

metautils.setsettings(settings)

print '\nMarking all Bonn Google data as rejected (needs to be changed if Google searches are ever resumed!)'
cur = metautils.getDBCursor(settings, dictCursor=True)
cur.execute('update data set accepted = %s where city = %s and source = %s',
            (False, 'bonn', 'g'))
metautils.dbCommit()

print '\nResetting open...'
cur = metautils.getDBCursor(settings, dictCursor=True)
cur.execute('select url, licenseshort from data')
for ores in cur.fetchall():
    #Empty license strings become 'nicht bekannt' with unknown openness;
    #otherwise recompute the open flag from the shortname
    licenseshort = ores['licenseshort'].strip()
    if licenseshort == '':
        license_text = 'nicht bekannt'
        is_open = None
    else:
        license_text = licenseshort
        is_open = metautils.isopen(licenseshort)
    cur.execute('update data set licenseshort = %s, open = %s where url = %s',
                (license_text, is_open, ores['url']))
metautils.dbCommit()
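
metautils.isopen is not shown in this snippet. A plausible sketch, assuming it simply checks the shortname against a fixed list of open licenses (the list below is illustrative, not the project's actual data):

OPEN_LICENSES = ('cc-by', 'cc-zero', 'dl-de-by-2.0')  #illustrative only

def isopen(licenseshort):
    #Hypothetical stand-in for metautils.isopen
    return licenseshort.lower() in OPEN_LICENSES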

print 'Finding cities with data...'
cities = metautils.getCitiesWithData()
print cities
Example #5
import psycopg2
import unicodecsv

import metautils

from dbsettings import settings

metautils.setsettings(settings)

csvfile = open('export.csv', 'wb')
csvwriter = unicodecsv.writer(csvfile)

cur = metautils.getDBCursor(settings, dictCursor=True)
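#unnest flattens the Postgres array column so that each distinct category
#becomes its own row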
cur.execute('select distinct unnest(categories) as cat from data')
columns = ['city', 'source', 'formats', 'license', 'filelist', 'num. files']
categories = []
for catrow in cur.fetchall():
    categories.append(catrow['cat'])
    
columns.extend(categories)

csvwriter.writerow(columns)

cur.execute('select city, source, formats, licenseshort, filelist, categories from data where accepted = %s', (True,))
for res in cur.fetchall():
    row = [res['city'], res['source'], metautils.arraytocsv(res['formats']), res['licenseshort'],
           metautils.arraytocsv(res['filelist']), str(len(res['filelist']))]
    #One column per category: 'x' if this dataset covers it, blank otherwise
    for el in categories:
        if el in res['categories']:
            row.append('x')
        else:
            row.append('')
    csvwriter.writerow(row)

csvfile.close()
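
The result is a wide matrix: one row per accepted dataset, one column per distinct category, with 'x' marking membership. With made-up data (every value below is illustrative) the CSV might look like:

city,source,formats,license,filelist,num. files,umwelt,verkehr
koeln,d,CSV,cc-by,luftdaten.csv,1,x,
bonn,b,"XLS,PDF",nicht bekannt,"a.xls,b.pdf",2,,x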