Python UserAgent示例，pyetherchain.pyetherchain.UserAgent Python示例

示例#1

0

显示文件

文件： downloadSCs.py 项目： AlekseiSmirnov/solidity-metrics

 def __init__(self, proxies=None):
     """Read ``config.ini`` and create the HTTP session and EtherChain client.

     :param proxies: optional proxy mapping handed to ``UserAgent``. When
         omitted, a fresh empty dict is created per instance, so no two
         instances share one default dict object.
     """
     self.config = configparser.ConfigParser()
     self.config.read('config.ini')
     self.session = UserAgent(baseurl="https://etherscan.io",
                              retry=5,
                              retrydelay=8,
                              proxies={} if proxies is None else proxies)
     self.ec = EtherChain()
     # Filled in later with a parsed page; nothing is loaded at construction.
     self.soup = None

示例#2

0

显示文件

文件： download_contracts_etherscan_io.py 项目： imbinatas/smart-contract-sanctuary

 def __init__(self, baseurl=None, proxies=None):
     """Create the HTTP session used for all requests.

     :param baseurl: site root to scrape; falls back to
         ``https://www.etherscan.io`` when falsy.
     :param proxies: optional proxy mapping handed to ``UserAgent``. When
         omitted, a fresh empty dict is created per instance, so no two
         instances share one default dict object.
     """
     baseurl = baseurl or "https://www.etherscan.io"
     self.session = UserAgent(baseurl=baseurl,
                              retry=5,
                              retrydelay=8,
                              proxies={} if proxies is None else proxies)

示例#3

0

显示文件

文件： download_contracts_etherscan_io.py 项目： imbinatas/smart-contract-sanctuary

class EtherScanIoApi(object):
    """
    Base EtherScan.io Api implementation.

    Pages are fetched through a ``UserAgent`` session and taken apart with
    regular expressions; contract source blocks are normalised with
    BeautifulSoup.
    """

    # Text looked for in a response body to detect rate limiting.
    _THROTTLE_NOTICE = ("You have reached your maximum request limit for "
                        "this resource. Please try again later")
    # Start of the compiler-settings box; everything after it is discarded
    # because it is not solidity source.
    _SETTINGS_MARKER = ("<span class='text-secondary'>Settings</span>"
                        "<pre class='js-sourcecopyarea editor' id='editor' "
                        "style='margin-top: 5px;'>")
    # Opening tag of one source-code block (``editor``, ``editor1``, ...).
    _SOURCE_OPEN_RE = (r"<pre class='js-sourcecopyarea editor' "
                       r"id='editor\d*' style='margin-top: 5px;'>")
    # Entities that must not survive BeautifulSoup's text extraction.
    _HTML_ENTITIES = ("&lt;", "&gt;", "&le;", "&ge;", "&amp;", "&vert;",
                      "&quot;")

    def __init__(self, baseurl=None, proxies=None):
        """Create the HTTP session used for all requests.

        :param baseurl: site root; falls back to ``https://www.etherscan.io``
            when falsy.
        :param proxies: optional proxy mapping handed to ``UserAgent``. A
            fresh empty dict is created when omitted, so instances never
            share one default dict object.
        """
        baseurl = baseurl or "https://www.etherscan.io"
        self.session = UserAgent(baseurl=baseurl,
                                 retry=5,
                                 retrydelay=8,
                                 proxies={} if proxies is None else proxies)

    def get_contracts(self, start=0, end=None):
        """Yield one dict per row of the ``/contractsVerified`` listing.

        :param start: page number to request first.
        :param end: last page to request (inclusive). When falsy, the last
            page number reported by the first fetched page is used.
        :return: generator of dicts with the keys ``address``, ``name``,
            ``compiler``, ``balance``, ``txcount`` (int), ``settings`` and
            ``date``.
        :raises IndexError: when the pager could not be found on a page
            after five attempts, or when the page contains no ``<tbody>``.
        """
        page = start

        while not end or page <= end:
            for _ in range(5):
                resp = self.session.get("/contractsVerified/%d?ps=100" %
                                        page).text
                pageResult = re.findall(
                    r'Page <strong(?:[^>]+)>(\d+)</strong> of <strong(?:[^>]+)>(\d+)</strong>',
                    resp)
                if pageResult:
                    break
                time.sleep(10)  # wait a bit ;)
            else:
                # Same exception type the bare ``pageResult[0]`` lookup used
                # to raise, but with a message that explains what happened.
                raise IndexError(
                    "no pager found on /contractsVerified/%d after 5 attempts"
                    % page)

            # Trust the page number reported by the site over our own counter.
            page, lastpage = (int(number) for number in pageResult[0])
            if not end:
                end = lastpage

            for col in self._parse_tbodies(resp)[0]:  # only use first tbody
                text = self._extract_text_from_html
                yield {
                    'address': text(col[0]).split(" ", 1)[0],
                    'name': text(col[1]),
                    # col[2] is not used by this listing layout.
                    'compiler': text(col[3]),
                    'balance': text(col[4]),
                    'txcount': int(text(col[5])),
                    'settings': text(col[6]),
                    'date': text(col[7]),
                }
            page += 1

    def get_contract_source(self, address):
        """Download and return the verified source code of ``address``.

        Up to 20 attempts are made, sleeping ``1 + 2.5 * attempt`` seconds
        after every throttled or failed attempt.

        :param address: contract address appended to ``/address/``.
        :return: text of all source blocks joined by a blank line.
        :raises Exception: the error of the last failed attempt, or a generic
            ``Exception`` when every attempt was answered with the
            rate-limit notice.
        """
        import time
        # ``except ... as name`` unbinds ``name`` when the clause ends, so the
        # error has to be stored under a different name to be re-raised
        # after the loop.
        last_error = None
        for attempt in range(20):
            resp = self.session.get("/address/%s" % address).text
            if self._THROTTLE_NOTICE in resp:
                print("[[THROTTELING]]")
                time.sleep(1 + 2.5 * attempt)
                continue
            try:
                print(
                    "=======================================================")
                print(address)
                # remove settings box. this is not solidity source
                resp = resp.split(self._SETTINGS_MARKER, 1)[0]

                sources = []
                for rawSource in re.split(self._SOURCE_OPEN_RE, resp)[1:]:
                    src = rawSource.split("</pre><br>", 1)[0]
                    # NOTE(review): no parser is named, so bs4 picks whichever
                    # one is installed - confirm that is intended.
                    source = BeautifulSoup(src).get_text()  # normalize html.
                    if DEBUG_PRINT_CONTRACTS:
                        print(source)
                    if any(entity in source
                           for entity in self._HTML_ENTITIES):
                        raise Exception(
                            "HTML IN OUTPUT!! - BeautifulSoup messed up..")
                    sources.append(source)
                if not sources:
                    raise Exception(
                        "unable to find source-code. rate limited? retry..")
                return "\n\n".join(sources)
            except Exception as err:
                last_error = err
                print(err)
                time.sleep(1 + 2.5 * attempt)

        if last_error is not None:
            raise last_error
        raise Exception(
            "unable to fetch source-code: still rate limited after 20 attempts")

    def _extract_text_from_html(self, s):
        """Return ``s`` with every ``<...>`` tag removed and whitespace stripped."""
        return re.sub(r'<[^<]+?>', '', s).strip()

    def _extract_hexstr_from_html_attrib(self, s):
        """Return the last path segment of a single-quoted attribute in ``s``.

        Strings that do not look like markup (no ``>`` or no ``</``) are
        returned unchanged.
        """
        if ">" in s and "</" in s:
            return ''.join(re.findall(r".+/([^']+)'", s))
        return s

    def _get_pageable_data(self, path, start=0, length=10):
        """Fetch a JSON listing and strip the HTML from well-known fields.

        :param path: request path passed to the session.
        :param start: value of the ``start`` query parameter.
        :param length: value of the ``length`` query parameter.
        :return: the decoded JSON response; the items under ``data`` are
            cleaned in place.
        """
        params = {
            "start": start,
            "length": length,
        }
        resp = self.session.get(path, params=params).json()
        # cleanup HTML from response
        text_fields = {"account", "blocknumber", "type", "direction"}
        hex_fields = {"parenthash", "from", "to", "address"}
        for item in resp['data']:
            for san_k in text_fields.intersection(item):
                item[san_k] = self._extract_text_from_html(item[san_k])
            for san_k in hex_fields.intersection(item):
                item[san_k] = self._extract_hexstr_from_html_attrib(
                    item[san_k])
        return resp

    def _parse_tbodies(self, data):
        """Parse every ``<tbody>`` of ``data`` into rows of raw cell HTML.

        :return: list (one entry per tbody) of lists (rows) of lists (the
            inner HTML of each ``<td>``). Only the tbody pattern matches
            across newlines; row and cell patterns work within one line.
        """
        return [
            [re.findall(r"<td.*?>(.+?)</td>", tr)
             for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody)]
            for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data,
                                    re.DOTALL)
        ]

示例#4

0

显示文件

文件： downloadSCs.py 项目： AlekseiSmirnov/solidity-metrics

class EtherScanIoApi(object):
    """
    Base EtherScan.io Api implementation
    TODO:
    - implement a script (client) that runs all the python scripts
    - fix the issue about SC with several classes. The issue is at 03 script
    - Fix the issue about solmet, for some addresses the tool is not able to
      get statistics at 02 and it breaks 03
    - fix _get_contract_name
    """

    # Text looked for in a response body to detect rate limiting.
    _THROTTLE_NOTICE = ("You have reached your maximum request limit for "
                        "this resource. Please try again later")
    # Opening tag that precedes the source code on an address page.
    _SOURCE_OPEN = ("<pre class='js-sourcecopyarea editor' id='editor' "
                    "style='margin-top: 5px;'>")

    def __init__(self, proxies=None):
        """Read ``config.ini`` and create the HTTP session and EtherChain client.

        :param proxies: optional proxy mapping handed to ``UserAgent``. A
            fresh empty dict is created when omitted, so instances never
            share one default dict object.
        """
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')
        self.session = UserAgent(baseurl="https://etherscan.io",
                                 retry=5,
                                 retrydelay=8,
                                 proxies={} if proxies is None else proxies)
        self.ec = EtherChain()
        # Parsed address page; set by _set_soup() and read by
        # _get_contract_name() / _get_compiler_version().
        self.soup = None

    def get_contracts_from_block(self, block):
        """Yield a description of every not-yet-known contract in ``block``.

        The transaction list of the block is fetched from etherscan.io and
        the link following each ``<i title='Contract'>`` marker is taken as
        a contract address.

        :param block: block number (anything ``str()`` can render).
        :return: generator of dicts with the keys ``address``, ``name``,
            ``compiler`` (always ``None``), ``compiler_version``,
            ``balance``, ``txcount``, ``firstseen`` and ``lastseen``.
        """
        listing = requests.get('https://etherscan.io/txs?block=' +
                               str(block)).text
        soup = BeautifulSoup(listing, features="html.parser")
        # A set removes addresses that appear in several transactions.
        addresses = {
            marker.findNext('a')['href'].replace('/address/', '')
            for marker in soup.select("i[title='Contract']")
        }
        for address in addresses:
            if not self._is_new_address(address):
                continue
            # NOTE(review): ``describe_contract.__self__`` is presumably just
            # the account object itself - confirm against pyetherchain.
            account = self.ec.account(address).describe_contract.__self__
            self._set_soup(address)
            yield {
                'address': address,
                'name': self._get_contract_name(),
                'compiler': None,
                'compiler_version': self._get_compiler_version(),
                'balance': account['balance'],
                'txcount': account['txreceived'],
                'firstseen': account['firstseen'],
                'lastseen': account['lastseen']
            }

    def get_contracts_from_etherscan(self, start=0, end=None):
        """Yield every not-yet-known contract of the ``/contractsVerified`` listing.

        :param start: page number to request first.
        :param end: last page to request (inclusive). When falsy, the last
            page number reported by the first fetched page is used.
        :return: generator of dicts with the keys ``address``, ``name``,
            ``compiler``, ``compiler_version``, ``balance`` (int or
            ``None``), ``txcount`` (text), ``firstseen`` and ``lastseen``.
        :raises IndexError: when a page has no pager or no ``<tbody>``.
        """
        page = start

        while not end or page <= end:
            resp = self.session.get("/contractsVerified/%d" % page).text
            # Non-greedy on purpose: a greedy ``<.*>`` runs to the last
            # ``>digits</`` on the line and can report a table cell as the
            # page count.
            page, lastpage = re.findall(
                r'Page <.*?>(\d+)</.*?> of <.*?>(\d+)</.*?>', resp)[0]
            page, lastpage = int(page), int(lastpage)
            if not end:
                end = lastpage

            for col in self._parse_tbodies(resp)[0]:  # only use first tbody
                address = self._extract_text_from_html(col[0]).split(" ", 1)[0]
                if not self._is_new_address(address):
                    continue
                # NOTE(review): see get_contracts_from_block about __self__.
                account = self.ec.account(address).describe_contract.__self__
                yield {
                    'address': address,
                    'name': self._extract_text_from_html(col[1]),
                    'compiler': self._extract_text_from_html(col[2]),
                    'compiler_version': self._extract_text_from_html(col[3]),
                    'balance':
                    self._get_balance(self._extract_text_from_html(col[4])),
                    'txcount': self._extract_text_from_html(col[5]),
                    'firstseen': account['firstseen'],
                    # Used to be filled with ``firstseen`` by mistake.
                    'lastseen': account['lastseen']
                }
            page += 1

    def write_etherChain_fn(self, contracts=None):
        """Dump the source of each contract and log it in the index file.

        For every contract whose source can be downloaded, the contract dict
        is appended as one line to the file named by the ``etherChain_fn``
        config entry and the source is written to
        ``<output_path>/<address>.sol``. Contracts without source are
        skipped. Processing stops after the contract at index 99 has been
        dumped.

        :param contracts: iterable of dicts with at least the keys
            ``address`` and ``name``; nothing is done when omitted.
        """
        if contracts is None:
            contracts = []
        amount = 100
        for nr, c in enumerate(contracts):
            # The index file is (re)opened per contract, as before, so it is
            # created even when the download below fails.
            with open(self.config['DEFAULT']['etherChain_fn'],
                      'a+') as index_file:
                print("got contract: %s" % c)

                f_path = os.path.join(self.config['DEFAULT']['output_path'],
                                      '%s.sol' % (c["address"]))
                try:
                    source = self._get_contract_source(c["address"])
                except Exception as err:
                    print("skipping %s: %s" % (c["address"], err))
                    continue
                # _get_contract_source() returns None when nothing was found.
                source = (source or "").strip()
                if not source:
                    print("skipping %s: no source code found" % c["address"])
                    continue

                index_file.write("%s\n" % c)
                with open(f_path, "wb") as source_file:
                    source_file.write(bytes(source, "utf8"))

                print("[%d/%d] dumped --> %s (%-20s) -> %s" %
                      (nr, amount, c["address"], c["name"], f_path))

                if nr + 1 >= amount:
                    break

    def _get_contract_source(self, address):
        """Return the source code shown on the page of ``address``.

        Throttled responses are retried up to five times with a growing
        delay. A page without a source block ends the attempts immediately.

        :return: the source text with the common HTML entities decoded, or
            ``None`` when no source could be obtained.
        """
        import time
        for attempt in range(5):
            resp = self.session.get("/address/%s" % address).text
            print("/address/%s" % address)
            if self._THROTTLE_NOTICE in resp:
                print("[[THROTTELING]]")
                time.sleep(1 + 2.5 * attempt)
                continue

            print("=======================================================")
            print(address)
            try:
                resp = resp.split(self._SOURCE_OPEN, 1)[1]
            except IndexError:
                # No source block on the page.
                print(traceback.format_exc())
                # NOTE(review): sleeping right before giving up looks
                # unintended, but it is kept to preserve request pacing.
                time.sleep(1 + 2.5 * attempt)
                break
            resp = resp.split("</pre><br>", 1)[0]
            # "&amp;" is decoded after "&lt;"/"&gt;" so that "&amp;lt;"
            # ends up as the literal text "&lt;".
            return resp.replace("&lt;", "<").replace("&gt;", ">").replace(
                "&le;", "<=").replace("&ge;", ">=").replace(
                    "&amp;", "&").replace("&vert;", "|")
        return None

    def _is_new_address(self, address):
        """Return True when ``address`` does not occur in the ``smec_fn`` file.

        This is a plain substring test against the whole file content.
        """
        with open(self.config['DEFAULT']['smec_fn']) as known:
            return address not in known.read()

    def _set_soup(self, address):
        """Fetch the ``#code`` page of ``address`` and keep it in ``self.soup``."""
        url = 'https://etherscan.io/address/' + address + '#code'
        self.soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    def _get_compiler_version(self):
        """Return the ``x.y.z`` compiler version found in ``self.soup``.

        :return: the version without the leading ``v``, or ``None`` when no
            matching ``<span>`` exists or its text holds no version number.
        """
        try:
            label = self.soup.findAll('span',
                                      text=re.compile(r'v0\.'))[0].contents[0]
        except IndexError:
            return None
        found = re.search(r'v(\d{1,2}\.\d{1,2}\.\d{1,2})', label)
        return found[1] if found else None

    def _get_contract_name(self):
        """Return the contract name found in ``self.soup``, or ``None``.

        The name is the first content of the ``<td>`` that follows the
        parent of the first ``<span>`` whose text contains ``Name``.
        """
        try:
            return self.soup.find(lambda tag: tag.name == "span" and "Name" in
                                  tag.text).parent.find_next(
                                      'td').contents[0].strip()
        except Exception:
            # Deliberately best effort: any layout surprise means "no name".
            return None

    def _get_addresses_from_fn(self, fn):
        """Return the non-empty, stripped lines of the file ``fn`` as a list."""
        with open(fn) as fp:
            stripped = (line.strip() for line in fp)
            return [line for line in stripped if line]

    def _extract_text_from_html(self, s):
        """Return ``s`` with every ``<...>`` tag removed and whitespace stripped."""
        return re.sub(r'<[^<]+?>', '', s).strip()

    def _extract_hexstr_from_html_attrib(self, s):
        """Return the last path segment of a single-quoted attribute in ``s``.

        Strings that do not look like markup (no ``>`` or no ``</``) are
        returned unchanged.
        """
        if ">" in s and "</" in s:
            return ''.join(re.findall(r".+/([^']+)'", s))
        return s

    def _get_balance(self, balance):
        """Return ``balance`` as an int after dropping all ASCII letters.

        :return: the integer value, or ``None`` when what remains is not a
            plain integer (for example a decimal fraction).
        """
        try:
            return int(re.sub(r'[a-zA-Z]', '', balance))
        except ValueError:
            return None

    def _get_pageable_data(self, path, start=0, length=10):
        """Fetch a JSON listing and strip the HTML from well-known fields.

        :param path: request path passed to the session.
        :param start: value of the ``start`` query parameter.
        :param length: value of the ``length`` query parameter.
        :return: the decoded JSON response; the items under ``data`` are
            cleaned in place.
        """
        params = {
            "start": start,
            "length": length,
        }
        resp = self.session.get(path, params=params).json()
        # cleanup HTML from response
        text_fields = {"account", "blocknumber", "type", "direction"}
        hex_fields = {"parenthash", "from", "to", "address"}
        for item in resp['data']:
            for san_k in text_fields.intersection(item):
                item[san_k] = self._extract_text_from_html(item[san_k])
            for san_k in hex_fields.intersection(item):
                item[san_k] = self._extract_hexstr_from_html_attrib(
                    item[san_k])
        return resp

    def _parse_tbodies(self, data):
        """Parse every ``<tbody>`` of ``data`` into rows of raw cell HTML.

        :return: list (one entry per tbody) of lists (rows) of lists (the
            inner HTML of each ``<td>``). Only the tbody pattern matches
            across newlines; row and cell patterns work within one line.
        """
        return [
            [re.findall(r"<td.*?>(.+?)</td>", tr)
             for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody)]
            for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data,
                                    re.DOTALL)
        ]

示例#5

0

显示文件

 def __init__(self, proxies=None):
     """Create the HTTP session used for all requests to etherscan.io.

     :param proxies: optional proxy mapping handed to ``UserAgent``. When
         omitted, a fresh empty dict is created per instance, so no two
         instances share one default dict object.
     """
     self.session = UserAgent(baseurl="https://etherscan.io",
                              retry=5,
                              retrydelay=8,
                              proxies={} if proxies is None else proxies)

示例#6

0

显示文件

class EtherScanIoApi(object):
    """
    Base EtherScan.io Api implementation.

    Pages are fetched through a ``UserAgent`` session and taken apart with
    regular expressions and plain string splitting.
    """

    # Text looked for in a response body to detect rate limiting.
    _THROTTLE_NOTICE = ("You have reached your maximum request limit for "
                        "this resource. Please try again later")
    # Markup that directly precedes the source code on an address page.
    _SOURCE_OPEN = ("</span><pre class='js-sourcecopyarea' id='editor' "
                    "style='margin-top: 5px;'>")

    def __init__(self, proxies=None):
        """Create the HTTP session used for all requests to etherscan.io.

        :param proxies: optional proxy mapping handed to ``UserAgent``. A
            fresh empty dict is created when omitted, so instances never
            share one default dict object.
        """
        self.session = UserAgent(baseurl="https://etherscan.io",
                                 retry=5,
                                 retrydelay=8,
                                 proxies={} if proxies is None else proxies)

    def get_contracts(self, start=0, end=None):
        """Yield one dict per row of the ``/contractsVerified`` listing.

        :param start: page number to request first.
        :param end: last page to request (inclusive). When falsy, the last
            page number reported by the first fetched page is used.
        :return: generator of dicts with the keys ``address``, ``name``,
            ``compiler``, ``balance``, ``txcount`` (int), ``settings`` and
            ``date``.
        :raises IndexError: when a page has no pager or no ``<tbody>``.
        """
        page = start

        while not end or page <= end:
            resp = self.session.get("/contractsVerified/%d" % page).text
            pager = re.findall(r'Page <b>(\d+)</b> of <b>(\d+)</b>', resp)[0]
            # Trust the page number reported by the site over our own counter.
            page, lastpage = (int(number) for number in pager)
            if not end:
                end = lastpage

            for col in self._parse_tbodies(resp)[0]:  # only use first tbody
                text = self._extract_text_from_html
                yield {
                    'address': text(col[0]).split(" ", 1)[0],
                    'name': text(col[1]),
                    'compiler': text(col[2]),
                    'balance': text(col[3]),
                    'txcount': int(text(col[4])),
                    'settings': text(col[5]),
                    'date': text(col[6]),
                }
            page += 1

    def get_contract_source(self, address):
        """Download and return the verified source code of ``address``.

        Up to 20 attempts are made, sleeping ``1 + 2.5 * attempt`` seconds
        after every throttled or failed attempt.

        :param address: contract address appended to ``/address/``.
        :return: the source text with the common HTML entities decoded.
        :raises Exception: the error of the last failed attempt, or a generic
            ``Exception`` when every attempt was answered with the
            rate-limit notice.
        """
        import time
        # ``except ... as name`` unbinds ``name`` when the clause ends, so the
        # error has to be stored under a different name to be re-raised
        # after the loop.
        last_error = None
        for attempt in range(20):
            resp = self.session.get("/address/%s" % address).text
            if self._THROTTLE_NOTICE in resp:
                print("[[THROTTELING]]")
                time.sleep(1 + 2.5 * attempt)
                continue
            try:
                print(
                    "=======================================================")
                print(address)
                # Raises IndexError when the page has no source block.
                resp = resp.split(self._SOURCE_OPEN, 1)[1]
                resp = resp.split("</pre><br>", 1)[0]
                # "&amp;" is decoded after "&lt;"/"&gt;" so that "&amp;lt;"
                # ends up as the literal text "&lt;".
                return resp.replace("&lt;", "<").replace("&gt;", ">").replace(
                    "&le;", "<=").replace("&ge;", ">=").replace(
                        "&amp;", "&").replace("&vert;", "|")
            except Exception as err:
                last_error = err
                print(err)
                time.sleep(1 + 2.5 * attempt)

        if last_error is not None:
            raise last_error
        raise Exception(
            "unable to fetch source-code: still rate limited after 20 attempts")

    def _extract_text_from_html(self, s):
        """Return ``s`` with every ``<...>`` tag removed and whitespace stripped."""
        return re.sub(r'<[^<]+?>', '', s).strip()

    def _extract_hexstr_from_html_attrib(self, s):
        """Return the last path segment of a single-quoted attribute in ``s``.

        Strings that do not look like markup (no ``>`` or no ``</``) are
        returned unchanged.
        """
        if ">" in s and "</" in s:
            return ''.join(re.findall(r".+/([^']+)'", s))
        return s

    def _get_pageable_data(self, path, start=0, length=10):
        """Fetch a JSON listing and strip the HTML from well-known fields.

        :param path: request path passed to the session.
        :param start: value of the ``start`` query parameter.
        :param length: value of the ``length`` query parameter.
        :return: the decoded JSON response; the items under ``data`` are
            cleaned in place.
        """
        params = {
            "start": start,
            "length": length,
        }
        resp = self.session.get(path, params=params).json()
        # cleanup HTML from response
        text_fields = {"account", "blocknumber", "type", "direction"}
        hex_fields = {"parenthash", "from", "to", "address"}
        for item in resp['data']:
            for san_k in text_fields.intersection(item):
                item[san_k] = self._extract_text_from_html(item[san_k])
            for san_k in hex_fields.intersection(item):
                item[san_k] = self._extract_hexstr_from_html_attrib(
                    item[san_k])
        return resp

    def _parse_tbodies(self, data):
        """Parse every ``<tbody>`` of ``data`` into rows of raw cell HTML.

        :return: list (one entry per tbody) of lists (rows) of lists (the
            inner HTML of each ``<td>``). Only the tbody pattern matches
            across newlines; row and cell patterns work within one line.
        """
        return [
            [re.findall(r"<td.*?>(.+?)</td>", tr)
             for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody)]
            for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data,
                                    re.DOTALL)
        ]