def checkWebsite():
    # Check each university's website URL from list_0423.csv and log failed
    # requests, non-200 responses, and redirects to a CSV file.
    # loadData() is defined elsewhere in this project.
    import requests
    import csv
    logs_file = 'logs_website_invalid.csv'
    fcsv = open(logs_file, 'wb')
    writer = csv.writer(fcsv)
    writer.writerow(['id','name', 'error','url', 'finalurl'])
    idx = 1
    unilist = loadData('list_0423.csv')
    for uni in unilist[1:]:  # skip the header row
        idx = idx + 1
        url = uni[4]
        if url == '':
            continue
        print url
        try:
            res = requests.get(url, timeout=20)
        except Exception, e:
            # The request itself failed (connection error, timeout, SSL error, ...)
            print e
            writer.writerow([uni[0], uni[1], 'exception', uni[4], ''])
        else:
            if res.status_code != 200:
                print res.status_code, res.history, res.url
                writer.writerow([uni[0], uni[1], 'badcode:' + str(res.status_code), uni[4], res.url.encode('utf-8')])
            if res.status_code == 200 and res.history:
                # OK response that arrived via redirects; record the final URL
                print res.status_code, res.history, res.url
                writer.writerow([uni[0], uni[1], 'redirect', uni[4], res.url.encode('utf-8')])
    fcsv.close()
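
Every example on this page calls a loadData() helper that is not shown here. Judging from how its return value is used (row 0 is the header, column 0 an id, column 1 a name, column 4 the website URL), it most likely just reads a CSV file into a list of rows. A minimal sketch, assuming exactly that behaviour and the same Python 2 style as the examples:

def loadData(csvfile):
    # Assumed helper: read the whole CSV into a list of rows; the first
    # row is the header, matching how the examples slice with [1:].
    import csv
    with open(csvfile, 'rb') as f:
        return list(csv.reader(f))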
Example #3
def checkEmblem():
    # Count (and print) universities whose emblem image is missing from emblem_0422/.
    import os
    unilist = loadData('list_0423.csv')
    count = 0
    for uni in unilist[1:]:
        imgname = uni[0] + '.png'
        if not os.path.exists('emblem_0422/' + imgname):
            print imgname, uni[1]
            count = count + 1
    print count
    return
Example #5
def correctWebsite():
    # Patch corrected website URLs from logs_website_merge.csv back into the
    # university list and write the result out as list_0425_3.csv.
    import csv
    sitelist = []
    with open('logs_website_merge.csv', 'rb') as f:
        reader = csv.reader(f)
        sitelist = list(reader)[1:]
    sitemap = {}  # map: university id -> its row in the correction log
    for s in sitelist:
        sitemap[s[0]] = s

    unilist = loadData('list_0425_2.csv')
    fout = open('list_0425_3.csv', 'wb')
    writer = csv.writer(fout)
    writer.writerow(unilist[0])
    count = 0
    for uni in unilist[1:]:
        id = uni[0]
        if id in sitemap and sitemap[id][-1] != '':
            count = count + 1
            uni[4] = sitemap[id][-1]
        writer.writerow(uni)
    fout.close()
    print count
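
All of these snippets are Python 2 (print statements, "except Exception, e", CSV files opened in binary 'rb'/'wb' mode). For reference, the same open-a-CSV-for-writing pattern under Python 3 would look roughly like the sketch below; the filename and header row are just the ones from checkWebsite() above:

import csv

with open('logs_website_invalid.csv', 'w', newline='', encoding='utf-8') as fcsv:
    writer = csv.writer(fcsv)
    writer.writerow(['id', 'name', 'error', 'url', 'finalurl'])
    # rows are written as str, so the manual .encode('utf-8') calls are not needed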
Example #6
def googleCrawler():
    # For each university with neither a downloaded emblem nor a cached Google
    # results page, fetch its results via scrapeGoogle() (defined elsewhere).
    import os
    from akaparser import loadData
    unilist = loadData('list_0423.csv')
    # uniname = 'Universidad Torcuato di Tella'
    # uniname = 'Universidad Nacional de Cordoba'
    count = 0
    for uni in unilist[1:]:
        img = uni[0] + '.png'
        if os.path.exists('emblem_0413/' + img):
            continue  # emblem already downloaded
        if os.path.exists('html_google_2/' + uni[0] + '.html'):
            continue  # Google results page already cached
        uniname = uni[1]
        # print uni
        count = count + 1
        print count
        scrapeGoogle(uni[0], uniname)
        # break
    # print count
    return
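
scrapeGoogle() is not shown on this page either. From the way googleCrawler() calls it, and from the html_google_2/<id>.html cache check above, it presumably fetches a Google search results page for the university name and caches the HTML under that id. A minimal sketch under that assumption (illustrative only; Google may block or captcha plain unauthenticated requests):

def scrapeGoogle(uid, uniname):
    # Assumed contract: fetch the Google results page for `uniname` and
    # save the raw HTML as html_google_2/<uid>.html.
    import requests
    import urllib
    url = 'https://www.google.com/search?q=' + urllib.quote_plus(uniname)
    res = requests.get(url, timeout=20)
    with open('html_google_2/' + uid + '.html', 'wb') as f:
        f.write(res.content)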