Example #1
    def read_warc_archive(self, archive_path):
        with open(archive_path, 'rb') as stream:
            for record in ArchiveIterator(stream):
                if record.rec_type != 'response':
                    continue
                try:
                    parser = BeautifulSoup(record.content_stream().read(),
                                           features="html.parser")
                except Exception:
                    # Skip records whose payload cannot be parsed as HTML.
                    continue
                for link in parser.find_all("a"):
                    href = link.attrs.get("href")
                    if href is None:
                        continue
                    if self.domain in href and href.startswith("http"):
                        path = urlparse(href).path
                        domain_link = self.proper_domain + path
                        self.data.append({
                            '{0}_link'.format(self.domain_name): domain_link,
                            'reference_link': record.rec_headers.get_header(
                                'WARC-TARGET-URI'),
                            'warc_date': dateutil.parser.parse(
                                record.rec_headers.get_header('WARC-Date'))
                        })
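The method above relies on imports and instance attributes that the snippet does not show. A minimal sketch of that surrounding context, assuming warcio, BeautifulSoup, and python-dateutil as the libraries in use (the class name and constructor below are illustrative assumptions, not from the original source):

import dateutil.parser
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from warcio.archiveiterator import ArchiveIterator


class WarcLinkExtractor:  # hypothetical wrapper class, not from the original code
    def __init__(self, domain, proper_domain, domain_name):
        self.domain = domain                # substring looked for in each href
        self.proper_domain = proper_domain  # canonical scheme + host prefix
        self.domain_name = domain_name      # used to build the '<name>_link' key
        self.data = []                      # rows appended by read_warc_archive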
Example #2
def get_timetable(origin_id, dest_id):
    today = datetime.now(tz=tz).date().isoformat()
    query_params = {
        "OriginStationId": origin_id,
        "OriginStationName": stations.by_id[origin_id],
        "DestStationId": dest_id,
        "DestStationName": stations.by_id[dest_id],
        "GoingTrainCln": today,
        "ReturnningTrainCln": today
    }
    query_params.update(base_query)
    session = requests.Session()
    session.get(base_uri, params=query_params)
    # Not actually Excel output, but a bare HTML file which is easier to parse.
    query_params['isExcel'] = 'true'
    # Doing two queries because the website is not stateless :(
    timetable_html = session.get(base_uri, params=query_params).text
    parser = BeautifulSoup(timetable_html, "lxml")
    # The last table in the page is the timetable.
    timetable = parser.find_all('table')[-1]
    out = []
    column_names = [
        'Train#', 'Platform', 'Departure', 'Arrival', 'Duration', 'Transfers',
        'extra', 'route_type'
    ]
    for row in timetable.find_all('tr'):
        out.append({
            col: cell.text.strip()
            for col, cell in zip(column_names, row.find_all('td'))
        })
    out.pop(0)  # remove column headers
    return out
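get_timetable returns a list of dicts keyed by column_names, with the header row popped off. A minimal sketch of consuming that structure; the sample row below is made up for illustration:

# Hypothetical sample of what get_timetable(origin_id, dest_id) might return.
rows = [{'Train#': '123', 'Platform': '2', 'Departure': '08:15',
         'Arrival': '09:02', 'Duration': '00:47', 'Transfers': '0',
         'extra': '', 'route_type': ''}]

for row in rows:
    print(row['Departure'], '->', row['Arrival'], 'on platform', row['Platform'])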
Example #3
    def process_record(self, record):
        if record.rec_type != 'response':
            return
        content_type = record.http_headers.get_header('content-type', None)
        if content_type is None or 'html' not in content_type:
            # skip non-HTML or unknown content types
            return

        try:
            parser = BeautifulSoup(record.content_stream().read(),
                                   features="html.parser")
        except Exception:
            # Skip payloads that cannot be parsed as HTML.
            return

        for link in parser.find_all("a"):
            href = link.attrs.get("href")
            if href is None:
                continue
            if self.domain in href and href.startswith("http"):
                path = urlparse(href).path
                domain_link = self.proper_domain + path
                if domain_link[-1] != '/':
                    domain_link += '/'
                link_data = [{
                    '{0}_link'.format(self.domain_name): domain_link,
                    'reference_link':
                        record.rec_headers.get_header('WARC-TARGET-URI'),
                    'warc_date': dateutil.parser.parse(
                        record.rec_headers.get_header('WARC-Date'))
                }]
                yield link_data
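Because process_record contains yield statements, calling it always returns a generator, even when it bails out early; the caller iterates the yielded one-element lists. A minimal sketch of such a driver, assuming an object exposing process_record and a WARC file on disk (the function name and result handling are assumptions):

from warcio.archiveiterator import ArchiveIterator


def collect_links(extractor, archive_path):
    # Hypothetical driver: feed every record to process_record and flatten
    # the yielded one-element lists into a single result list.
    results = []
    with open(archive_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            for link_batch in extractor.process_record(record):
                results.extend(link_batch)
    return results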
Example #4
    def scrape_data(self, indicator, query_type):

        passive_table = []

        # search period 7 is "complete history"
        search_period = '7'

        # 0 = Current Day
        # 1 = Past 72 Hours
        # 2 = Past Week
        # 3 = Past Month
        # 4 = Past 3 Months
        # 5 = Past 6 Months
        # 6 = Past Year

        format = '0'
        # 0 = Display results on screen
        # 1 = Output to CSV file (Comma separated w/o quotes)
        # 2 = Output to CSV file (Comma separated with quotes)
        # 3 = Output to CSV file (Tab separated w/o quotes)
        # 4 = Output to CSV file (Tab separated with quotes)
        # 5 = Output to CSV file (Pipe separated w/o quotes)
        # 6 = Output to CSV file (Pipe separated with quotes)

        # queryType
        # A = Query IP Address or CIDR,
        # H = Query Hostname
        # X = Query Domain Name for Hosts
        # D = Query Domain for Authoritative Nameservers
        # N = Query Nameserver for Authoritative Domains

        url = "https://research.iad.internetidentity.com/index.php?period=" + search_period + "&format=" + format + "&queryType=" + query_type + "&target=" + indicator + "&submit=Submit+Query"

        self.browser.open(url)
        parser = self.browser.parsed

        for tr in parser.find_all('tr')[7:]:

            tds = []
            for td in tr.find_all('td'):
                tds.append(td.text.strip())

            # check that table data exists
            if len(tds) == 4:
                IID_seen = tds[0]
                IID_host = tds[1]
                IID_qType = tds[2]
                IID_ip = tds[3]

                passive_table.append({
                    'ip': IID_ip,
                    'domain': IID_host,
                    'date': IID_seen,
                    'ip_location': {}
                })

            tds[:] = []

        self.results.extend(passive_table)
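The URL above is assembled by string concatenation, so an indicator containing reserved characters would need manual escaping. A sketch of building the same query with urllib.parse.urlencode, using the parameter names from the concatenated URL (the helper name and default values are assumptions):

from urllib.parse import urlencode


def build_query_url(indicator, query_type, search_period='7', fmt='0'):
    # urlencode percent-escapes the values and turns spaces into '+',
    # matching the 'Submit+Query' form of the hand-built URL above.
    params = {
        'period': search_period,
        'format': fmt,
        'queryType': query_type,
        'target': indicator,
        'submit': 'Submit Query',
    }
    return 'https://research.iad.internetidentity.com/index.php?' + urlencode(params)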
Example #5
    def scrape_data(self, indicator, query_type):

        passive_table = []

        # search period 7 is "complete history"
        search_period = '7'

        # 0 = Current Day
        # 1 = Past 72 Hours
        # 2 = Past Week
        # 3 = Past Month
        # 4 = Past 3 Months
        # 5 = Past 6 Months
        # 6 = Past Year

        format = '0'
        # 0 = Display results on screen
        # 1 = Output to CSV file (Comma separated w/o quotes)
        # 2 = Output to CSV file (Comma separated with quotes)
        # 3 = Output to CSV file (Tab separated w/o quotes)
        # 4 = Output to CSV file (Tab separated with quotes)
        # 5 = Output to CSV file (Pipe separated w/o quotes)
        # 6 = Output to CSV file (Pipe separated with quotes)

        # queryType
        # A = Query IP Address or CIDR,
        # H = Query Hostname
        # X = Query Domain Name for Hosts
        # D = Query Domain for Authoritative Nameservers
        # N = Query Nameserver for Authoritative Domains

        url = "https://research.iad.internetidentity.com/index.php?period=" + search_period + "&format=" + format + "&queryType=" + query_type + "&target=" + indicator + "&submit=Submit+Query"

        self.browser.open(url)
        parser = self.browser.parsed

        for tr in parser.find_all('tr')[7:]:

            tds = []
            for td in tr.find_all('td'):
                tds.append(td.text.strip())

            # check that table data exists
            if len(tds) == 4:
                IID_seen = tds[0]
                IID_host = tds[1]
                IID_qType = tds[2]
                IID_ip = tds[3]

                passive_table.append({'ip': IID_ip, 'domain': IID_host, 'date': IID_seen, 'ip_location': {}})

            tds[:] = []

        self.results.extend(passive_table)
Example #6
    def check_cookie(self):

        url = "https://research.iad.internetidentity.com"
        self.browser.open(url, verify=False)
        parser = self.browser.parsed

        # Verify login succeeded
        login_test = parser.find_all('a', {'href': '/logout.php'})

        if login_test:
            return True

        return False
Example #7
    def check_cookie(self):

        url = "https://research.iad.internetidentity.com"
        self.browser.open(url)
        parser = self.browser.parsed

        # Verify login succeeded
        login_test = parser.find_all('a', {'href': '/logout.php'})

        if login_test:
            return True

        return False
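Both check_cookie variants look for a logout link to confirm the session is authenticated. The attribute-filter form of find_all used there has an equivalent CSS-selector spelling; a small standalone sketch with a made-up HTML fragment:

from bs4 import BeautifulSoup

html = '<a href="/logout.php">Log out</a>'  # made-up page fragment
parser = BeautifulSoup(html, 'html.parser')

# Both return the same matches: anchors whose href is exactly /logout.php.
by_attrs = parser.find_all('a', {'href': '/logout.php'})
by_css = parser.select('a[href="/logout.php"]')
print(bool(by_attrs), bool(by_css))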
Example #8
def get_timetable(origin_id, dest_id):
    today = datetime.now(tz=tz).date().isoformat()
    query_params = {"OriginStationId": origin_id,
                    "OriginStationName": stations.by_id[origin_id],
                    "DestStationId": dest_id,
                    "DestStationName": stations.by_id[dest_id],
                    "GoingTrainCln": today,
                    "ReturnningTrainCln": today}
    query_params.update(base_query)
    session = requests.Session()
    session.get(base_uri, params=query_params)
    query_params['isExcel'] = 'true'  # actually not excel, but a bare HTML file which is easier to parse
    # Doing two queries because the website is not stateless :(
    timetable_html = session.get(base_uri, params=query_params).text
    parser = BeautifulSoup(timetable_html, "lxml")
    timetable = parser.find_all('table')[-1]  # Last table in the page is the timetable
    out = []
    column_names = ['Train#', 'Platform', 'Departure', 'Arrival', 'Duration', 'Transfers', 'extra', 'route_type']
    for row in timetable.find_all('tr'):
        out.append({col: cell.text.strip() for col, cell in zip(column_names, row.find_all('td'))})
    out.pop(0)  # remove column headers
    return out
Example #9
    def scrape_data(self, indicator, query_type):

        passive_table = []
        search_period = '5'

        # 1 = Current day
        # 2 = Current month
        # 3 = Past 6 months
        # 4 = Past year
        # 5 = Full Historical

        format = '0'
        # 0 = Display results on screen
        # 1 = Output to CSV file (Comma separated w/o quotes)
        # 2 = Output to CSV file (Comma separated with quotes)
        # 3 = Output to CSV file (Tab separated w/o quotes)
        # 4 = Output to CSV file (Tab separated with quotes)
        # 5 = Output to CSV file (Pipe separated w/o quotes)
        # 6 = Output to CSV file (Pipe separated with quotes)

        # queryType
        # A = Query IP Address or CIDR,
        # H = Query Hostname
        # X = Query Domain Name for Hosts
        # D = Query Domain for Authoritative Nameservers
        # N = Query Nameserver for Authoritative Domains

        url = "https://research.iad.internetidentity.com/index.php?search_period=" + search_period + "&format=" + format + "&queryType=" + query_type + "&target=" + indicator + "&submit=Submit+Query"

        self.browser.open(url)
        parser = self.browser.parsed

        passive_row = []

        for tr in parser.find_all('tr')[7:]:

            tds = []
            for td in tr.find_all('td'):
                tds.append(td.text.strip())

            # skip rows (e.g. header rows) that do not have a full set of cells
            if len(tds) < 5:
                continue

            # check for querytype to correctly display output
            if query_type == 'A' or query_type == 'X':
                IID_ip = tds[0]
                IID_asn = tds[1]
                IID_bgp = tds[2]
                IID_seen = tds[3]
                IID_host = tds[4]

            else:
                IID_host = tds[0]
                IID_seen = tds[1]
                IID_ip = tds[2]
                IID_asn = tds[3]
                IID_bgp = tds[4]

            passive_row = [IID_seen, IID_ip, IID_host]
            passive_table.append(passive_row[:])
            passive_row[:] = []
            tds[:] = []

        self.results.extend(passive_table)