Exemplo n.º 1
0
def action_download(doc_list, docs):
    """Download each URL in ``doc_list`` and save it under ``docs``.

    Every URL is fetched with a browser-like header set. A response with
    status 200 is written to ``docs + filename``, where the filename comes
    from the ``Content-Disposition`` header when it carries one and from the
    last path segment of the URL otherwise. Responses with any other status
    are skipped (a 404 is additionally logged as an HTTP error).

    Args:
        doc_list: Iterable of document URLs (strings). Spaces are
            percent-encoded before the request is made.
        docs: Destination directory prefix. It is concatenated directly with
            the filename, so it must end with a path separator.

    Returns:
        list: The URLs (with ``%20`` shown as spaces) that were saved
        successfully, in download order.
    """
    # Local import: only needed for the in-place progress line below.
    import sys

    info('Document Download Started')
    saved = 0
    download_list = []
    print('Gathering Live Documents For Metadata Mining\n')
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; pl; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB7.1 ( .NET CLR 3.5.30729',
        'Referer': 'https://www.google.co.uk/',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache'
    }
    for doc in doc_list:
        doc = doc.replace(' ', '%20')
        try:
            # NOTE(review): verify=False disables TLS certificate checks and
            # no timeout is set, so a stalled server can block the run.
            # Both are kept as-is to preserve existing behaviour.
            r = requests.get(doc.encode('utf-8'),
                             headers=headers,
                             verify=False)
            if r.status_code == 404:
                r.raise_for_status()
            if r.status_code != 200:
                continue

            # parse_header already handles quoted and unquoted values, so
            # the parsed parameter is used directly instead of re-matching
            # the raw header with a regex that only accepted quoted names.
            params = cgi.parse_header(
                r.headers.get('Content-Disposition', ''))[-1]
            filename = params.get('filename')
            if not filename:
                filename = str(doc).replace('%20', ' ').split('/')[-1]
            # The name may come from the remote server: keep only the final
            # path component so it cannot escape the target directory.
            filename = filename.replace('\\', '/').split('/')[-1]

            # Binary mode: r.content is raw bytes and must not go through
            # newline translation or text encoding.
            with open(docs + filename, 'wb') as out:
                out.write(r.content)

            # Count only after the write succeeded.
            saved += 1
            # Carriage return without newline keeps the counter on one line.
            sys.stdout.write('\tDownload Count: {}\r'.format(saved))
            source = str(doc).replace('%20', ' ')
            info(source)
            download_list.append(source)

        # The requests handlers come first: RequestException derives from
        # IOError, so listing IOError earlier would make them unreachable.
        except requests.exceptions.HTTPError:
            info('Error: File Not Found Server Side: HTTPError')
        except requests.exceptions.ConnectionError:
            info('Error: File Not Found Server Side: ConnectionError')
        except ValueError:
            # Also reached for malformed URLs and text-encoding failures,
            # since those exception types derive from ValueError.
            info('No Filename in header')
        except IOError:
            info('Not Found: {}'.format(str(doc).replace('%20', ' ')))
        except Exception:
            # Best-effort loop: log and move on to the next document.
            info(
                'An Unhandled Exception Has Occured, Please Check The Log For Details\n'
                + INFO_LOG_FILE)
            info(str(doc).replace('%20', ' '))
    if saved < 1:
        return download_list
    # presumably get_size reports the directory size in MB - TODO confirm
    data_size = get_size(docs)
    print('\tData Downloaded: {}MB'.format(str(math.floor(data_size))))
    info('Documents Downloaded: {}'.format(saved))
    return download_list
Exemplo n.º 2
0
def action_download(doc_list, docs):
    """Download each URL in ``doc_list`` and save it under ``docs``.

    The filename is taken from a quoted ``filename="..."`` entry in the
    ``Content-Disposition`` response header. When the response has no such
    header at all, a name of the form ``file<N>.<ext>`` is generated, where
    ``<ext>`` is whatever follows the last dot of the URL (query string
    removed). A response whose header is present but carries no quoted
    filename is skipped.

    Args:
        doc_list: Iterable of document URLs (strings).
        docs: Destination directory prefix. It is concatenated directly with
            the filename, so it must end with a path separator.

    Returns:
        list: The URLs that were saved successfully, in download order.

    Raises:
        SystemExit: If no document at all could be saved.
    """
    info('Document Download Started')
    saved = 0
    download_list = []
    print('\nGathering Live Documents For Metadata Mining\n')
    for doc in doc_list:
        try:
            r = requests.get(doc)
            if r.status_code == 404:
                r.raise_for_status()
        # The specific requests handlers come first: RequestException derives
        # from IOError, so a leading IOError clause would shadow them.
        except (requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError):
            info('Error: File Not Found Server Side')
            info(doc)
            continue
        except IOError:
            # Any other request failure: log it instead of dropping it
            # silently, then carry on with the next document.
            info('Error: Request Failed')
            info(doc)
            continue

        disposition = r.headers.get('content-disposition')
        if disposition is None:
            # No header: build a numbered name from the URL's extension.
            pieces = str(doc).rsplit('.', 1)
            if len(pieces) < 2:
                # A URL without any dot used to raise IndexError and abort
                # the whole run; skip just this document instead.
                info('Error: No File Extension In URL')
                info(doc)
                continue
            ext = re.sub(r'\?.*', r'', pieces[1])
            filename = "file{}.{}".format(saved, ext.replace('?T', ''))
        else:
            match = re.search('filename="(.*)"', disposition)
            if match is None:
                continue
            # The name comes from the remote server: keep only the final
            # path component so it cannot escape the target directory.
            filename = match.group(1).replace('\\', '/').split('/')[-1]

        try:
            # Binary mode: r.content is raw bytes and must not go through
            # newline translation or text encoding.
            with open(docs + filename, "wb") as out:
                out.write(r.content)
        except IOError:
            info('Error: Could Not Write File')
            info(doc)
            continue
        except Exception:
            # Best-effort loop: log and move on to the next document.
            info(
                'An Unhandled Exception Has Occured, Please Check The Log For Details\n'
                + INFO_LOG_FILE)
            info(doc)
            info(r.headers)
            continue

        # Count only after the write succeeded; the counter also numbers the
        # generated fallback filenames.
        saved += 1
        # Carriage return without newline keeps the counter on one line.
        sys.stdout.write('\tDownload Count: {}\r'.format(saved))
        download_list.append(doc)
        info(doc)

    if saved < 1:
        # Kept from the original contract: nothing downloaded ends the
        # program rather than returning an empty list.
        sys.exit()
    # presumably get_size reports the directory size in MB - TODO confirm
    data_size = get_size(docs)
    print('\n\nData Downloaded: {}MB'.format(str(math.floor(data_size))))
    info('Documents Downloaded: {}'.format(saved))
    return download_list