Example #1
def tabelog(url: str) -> Dict[str, str]:
    result_dict: Dict[str, str] = {}

    soup = U.fetch_url(url)
    n_rst_pages = _get_total_page_num(soup)
    page_number = random.choice(n_rst_pages) + 1

    soup = U.fetch_one_page(url, page_number)
    for _ in range(C.MAX_RETRY):
        try:
            rst_url, name = _random_choice_rst(soup)
            result_dict["rst_url"] = rst_url
            rst_url = urllib.parse.urljoin(rst_url,
                                           C.PHOTO_SUBDIR_TABELOG) + "/"
            result_dict["rst_name"] = name

            img_soup = U.fetch_one_page(rst_url, 1)
            n_img_pages = _get_total_page_num(img_soup)
            page_number = random.choice(n_img_pages) + 1

            img_page_url = rst_url + "1/smp0/D-normal/" + str(page_number)
            img_soup = U.fetch_url(img_page_url)
            img_url, title = _random_choice_img(img_soup)
            break
        except urllib.error.HTTPError:
            continue
    result_dict["img_name"] = title
    result_dict["img_url"] = img_url
    return result_dict
Example #2
def main():
    html1 = utils.fetch_url(url_1)
    html2 = utils.fetch_url(url_2)
    html3 = utils.fetch_url(url_3)
    html4 = utils.fetch_url(url_4)
    process1(html1)
    process2(html2)
    process3(html3)
    process3(html4)
    save(txt_filename, url_list)
Example #3
def get_title(url):
    if not is_valid_url(url):
        return

    host = urlparse(url).netloc

    try:
        ip = socket.gethostbyname(host)
    except:
        return

    if is_not_public(ip):
        return

    data = fetch_url(url)
    page = data.read(4096)

    if page == '':
        return

    title_match = title_regex.search(page)
    if not title_match:
        return

    title = title_match.group(1).strip().replace('\n', '')
    return unescape(title)
Example #4
File: extractor.py Project: flaresky/stock
def extract_stock2(code):
    fields = {
              'sjl' : '592920',
              'zsz' : '3541450',
              'ltsz' : '3475914',
              }
    cnx = mysqllib.get_connection()
    cursor = cnx.cursor()
    url = "http://d.10jqka.com.cn/v2/realhead/hs_%s/last.js"%(code)
    data = utils.fetch_url(url)
    data = re.sub(r'quotebridge.*?\((.*)\)', r'\1', data)
    jo = json.loads(data)['items']
    if jo is not None:
        try:
            jo['3541450'] = "%.2f"%(float(jo['3541450']) / 100000000)
            jo['3475914'] = "%.2f"%(float(jo['3475914']) / 100000000)
            keys = fields.keys()
            vals = ["'"+ (jo[fields[k]] or '')+"'" for k in keys]
            keys.append('code')
            vals.append("'%s'"%(code))
            updates = [keys[i]+"="+vals[i] for i in range(0, len(keys))]
        except:
            utils.print_with_time("url=%s"%(url))
            traceback.print_exc()
            return
         
        sql = "INSERT INTO stock (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s"%(', '.join(keys), ', '.join(vals), ', '.join(updates))
#         print sql
        cursor.execute(sql)
        cnx.commit()
    cursor.close()
    cnx.close()
Example #5
 def search_results(self):
     url = self.context.getRemoteUrl()
     search_term = urllib.quote_plus(self.searchterm)
     if self.has_searchterm():
         if not search_term:
                 return []
         else:
             qurl = substitute_parameters(url, self.request.form)
     else:
         qurl = url
     rd = fetch_url(qurl)
     results = rd['result']
     if rd['type'] == 'feed':
         try:
             self.total_results = int(results.feed.get('opensearch_totalresults','0'))
             if self.total_results == 0:
                 self.total_results = int(results.feed.get('totalresults','0'))
         except ValueError:
             pass
         for link in results.feed.get('links', []):
             if (link['rel']=='alternate') and (link['type']=='text/html'):
                 self.feed_html_link = link['href']
         self.feed_title = results.feed.get('title', '')
         return results['entries']
     elif rd['type'] == 'kml':
         return parse_kml(results)
     else:
         return []
Example #6
 def search_results(self):
     url = self.context.getRemoteUrl()
     search_term = urllib.quote_plus(self.searchterm)
     if self.has_searchterm():
         if not search_term:
             return []
         else:
             qurl = substitute_parameters(url, self.request.form)
     else:
         qurl = url
     rd = fetch_url(qurl)
     results = rd['result']
     if rd['type'] == 'feed':
         try:
             self.total_results = int(
                 results.feed.get('opensearch_totalresults', '0'))
             if self.total_results == 0:
                 self.total_results = int(
                     results.feed.get('totalresults', '0'))
         except ValueError:
             pass
         for link in results.feed.get('links', []):
             if (link['rel'] == 'alternate') and (link['type']
                                                  == 'text/html'):
                 self.feed_html_link = link['href']
         self.feed_title = results.feed.get('title', '')
         return results['entries']
     elif rd['type'] == 'kml':
         return parse_kml(results)
     else:
         return []
Example #7
    def _process_image(self, img):
        pic = pq(img).attr('src')

        # Attachment
        if pic == 'static/image/common/none.gif':
            pic = 'http://www.lightnovel.cn/{}'.format(pq(img).attr('file'))

        if pic.startswith('http'):
            # Resize/divide image if necessary
            try:
                image_buffer = StringIO(fetch_url(pic))
                image = Image.open(image_buffer)
                # Grayscale size saving too little
                #image = ImageOps.grayscale(image)
                if image.size[0] > image.size[1]:
                    image = image.rotate(90)
                image.thumbnail((600, 800), Image.ANTIALIAS)

                filename = self._add_image(image)
                pq(img).attr('src', filename)
                pq(img).attr('width', str(image.size[0]))
                pq(img).attr('height', str(image.size[1]))
                image_buffer.close()
            except HTTPError:
                print 'Cannot find image: {}'.format(pic)
Example #8
 def _turn_page(self):
     parts = self.url.split('-')
     parts[2] = str(self.next_page)
     url = '-'.join(parts)
     self.page = fetch_url(url).decode('gbk', 'replace')
     self.d = pq(self.page)
     self.next_page += 1
Example #9
def get_current_streams():
    """Returns a list of streams (with only relevant keys)"""
    streams = list()
    afreeca_url = 'http://live.afreecatv.com:8057/afreeca/broad_list_api.php'
    # afreeca_url = 'http://localhost:8000/broad_list_api.php'
    afreeca_response = utils.fetch_url(afreeca_url)
    afreeca_json_str = format_afreeca_response_to_json(afreeca_response)

    json_object = json.loads(afreeca_json_str)
    time_format = '%Y-%m-%d %H:%M'
    time_offset = 9
    for info in json_object['CHANNEL']['REAL_BROAD']:
        id = info['user_id']
        viewers = int(info['total_view_cnt'])
        locked = info['is_password'] == 'Y'
        online_since = utils.get_utc_time(info['broad_start'], time_format,
                                          time_offset)
        image = info['broad_img']
        stream = {
            'type': 'afreeca',
            'id': id,
            'viewers': viewers,
            'online_since': online_since,
            'image': image,
            'locked': locked
        }
        streams.append(stream)
    return streams
Example #10
File: title.py Project: ctrlcctrlv/raisin
def get_title(url):
    if not is_valid_url(url):
        return
    
    host = urlparse(url).netloc

    try:
        ip = socket.gethostbyname(host)
    except:
        return

    if is_not_public(ip):
        return
    
    data = fetch_url(url)
    page = data.read(4096)

    if page == '':
        return
    
    title_match = title_regex.search(page)
    if not title_match:
        return

    title = title_match.group(1).strip().replace('\n', '')
    return unescape(title)
Example #11
def hdp_albums():
    return [{
        'label': r['title'],
        'path': 'plugin://script.module.hdpparser?uri=' + quote_plus(r['url']),
        'thumbnail': r.get('thumb'),
    } for r in json.loads(fetch_url('http://xbmc.hdpfans.com/albums.json'))
    ]
Example #12
def hdp_albums():
    return [{
        'label': r['title'],
        'path': 'plugin://script.module.hdpparser?uri=' + quote_plus(r['url']),
        'thumbnail': r.get('thumb'),
    } for r in json.loads(fetch_url('http://xbmc.hdpfans.com/albums.json'))
    ]
Example #13
    def get_uk(self):
        url = "http://pan.baidu.com/share/manage"
        content = fetch_url(url, headers={"Cookie": "BDUSS=" + self._bduss})
        _RE = re.compile(r'<a class="homepagelink" href="http://pan.baidu.com/share/home\?uk=(\d+)"')  # noqa
        uk = int(_RE.search(content).group(1))

        return uk
Example #14
def main():
    for i in range(1, 11):
        url = 'https://pente.koro-pokemon.com/data/waza-list-{0}.shtml'.format(
            i)
        html = utils.fetch_url(url)
        process(html)

    utils.save(json_filename, new_moves_list)
Example #15
    def _fetch_clientapi(self, url, data=None, headers={}, need_auth=True):
        if need_auth and self._bduss:
            headers['Cookie'] = 'BDUSS=' + self._bduss
        content = fetch_url(url, data, headers)
        r = json.loads(content)
        if r.get('errno', 0) or r.get('error_code', 0):
            raise ClientApiError(r)

        return r
Example #16
File: addon.py Project: HeddaZ/kodi-addons
def hdp_albums():
    return [
        {
            "label": r["title"],
            "path": "plugin://script.module.hdpparser?uri=" + quote_plus(r["url"]),
            "thumbnail": r.get("thumb"),
        }
        for r in json.loads(fetch_url("http://xbmc.hdpfans.com/albums.json"))
    ]
Example #17
File: znds.py Project: peiit/kodi_plugins
def guess():
    items = generate_items_from_page(fetch_url(urljoin(HOST, '/baidu.php')))

    items.append({
        'label': '换一批',
        'path': m.url_for('guess'),
    })

    return m.plugin.finish(items, view_mode='thumbnail')
Example #18
    def get_uk(self):
        url = 'http://pan.baidu.com/share/manage'
        content = fetch_url(url, headers={'Cookie': 'BDUSS=' + self._bduss})
        _RE = re.compile(
            r'<a class="homepagelink" href="http://pan.baidu.com/share/home\?uk=(\d+)"'
        )  # noqa
        uk = int(_RE.search(content).group(1))

        return uk
Example #19
def random_pair():
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&list=random&rnnamespace=0&rnlimit=2'
    url = host + parameters

    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    
    return page_dictionary['query']['random'][0]['title'].encode('utf-8'), page_dictionary['query']['random'][1]['title'].encode('utf-8') 
Example #20
    def _fetch_clientapi(self, url, data=None, headers={}, need_auth=True):
        if need_auth and self._bduss:
            headers["Cookie"] = "BDUSS=" + self._bduss
        content = fetch_url(url, data, headers)
        r = json.loads(content)
        if r.get("errno", 0) or r.get("error_code", 0):
            raise ClientApiError(r)

        return r
Example #21
    def archive_character_data(self, graph):
        """
        Fetch data for the nodes in the graph. Only including those which are online.

        """
        nodes = graph.nodes()

        # Find only those characters who are not "OLD"
        # old_id = single_column(self.database_connection,
        #                        f"SELECT character_id FROM {self.table_name}_character_info WHERE last_login_date < ?",
        #                        (self.tSinceLogin,))
        # remaining_nodes = [n for n in nodes if n not in old_id]
        remaining_nodes = nodes
        # Gets character attributes for each found in the friend lists
        archive_id = single_column(
            self.archive_connection,
            f"SELECT character_id FROM {self.table_name}_node ")
        remaining_nodes = [n for n in remaining_nodes if n not in archive_id]
        re_count = len(remaining_nodes)
        fetch_logger().info(
            f"Number of nodes in graph is: {len(nodes)} Number of unarchived nodes is: {re_count}"
        )
        # Break the list up into chunks of 40
        smallLists = chunks(remaining_nodes, CHARACTER_INFO_BATCH_SIZE)

        completed_jobs = 0
        for character_id_batch in smallLists:

            character_ids = ",".join(character_id_batch)
            url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/" \
                  f"{self.namespace}/character/?character_id={character_ids}" \
                  f"&c:resolve=outfit,name,stats,times,stat_history"

            fetch_logger().debug(f'fetching {url}')
            decoded = fetch_url(url)

            results = decoded["character_list"]
            for result in results:
                # Unpack the server response and add each to the archive_connection.
                try:
                    self.archive_connection.execute(
                        f"INSERT OR REPLACE into {self.table_name}_node (character_id,raw) VALUES(?,?)",
                        (result["character_id"], json.dumps(result)),
                    )
                except Exception:
                    fetch_logger().info("archive_connection failure")
                    if "error" in str(decoded):
                        fetch_logger().info("Server down")
                        exit(1)
                    else:
                        raise
            self.archive_connection.commit()
            completed_jobs += len(character_id_batch)
            fetch_logger().info(
                f"looking up data completion is at {(completed_jobs / re_count) * 100.0} percent"
            )
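The chunks helper used above is not shown in this example; assuming it simply splits the node list into fixed-size batches (CHARACTER_INFO_BATCH_SIZE ids per census request), a minimal sketch would be:

def chunks(items, size):
    # Yield successive slices of `items` of length `size`;
    # the final slice may be shorter.
    for start in range(0, len(items), size):
        yield items[start:start + size]

Each yielded batch is then joined into a comma-separated character_id list for a single request.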
Example #22
def scrap2(doi):
    base_url = "http://libgen.lc/scimag/ads.php?doi="
    html = utils.fetch_url(base_url + doi)
    for fila in html.split("\n"):
        try:
            clave = fila.index("http://booksdl.org/scimag/get.php?doi=" + doi)
            url = fila[clave:fila.index('"', clave)]
            return url
        except:
            pass
Example #23
def random_pair():
    global language
    host = 'http://%s.wikipedia.org/w/api.php?' % language
    parameters = 'format=json&action=query&list=random&rnnamespace=0&rnlimit=2'
    url = host + parameters

    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    
    return [page_dictionary['query']['random'][i]['title'].encode('utf-8') for i in (0, 1)]
Example #24
File: znds.py Project: HeddaZ/kodi-addons
def search_result(keyword):
    return m.plugin.finish(
        generate_items_from_page(
            fetch_url(
                HOST + 'if.php',
                'q=' + quote_plus(keyword)
            )
        ),
        view_mode='thumbnail'
    )
Example #25
File: znds.py Project: HeddaZ/kodi-addons
def guess():
    items = generate_items_from_page(
        fetch_url(urljoin(HOST, '/baidu.php')))

    items.append({
        'label': '换一批',
        'path': m.url_for('guess'),
    })

    return m.plugin.finish(items, view_mode='thumbnail')
Example #26
File: api.py Project: hanabi1224/Cynthia
def get_stocks_data_from_google(symbols):
    import json
    url = 'http://finance.google.com/finance/info?client=ig&q={0}'.format(','.join(symbols))
    print(url)
    r = fetch_url(url, 0)
    json_str = r.content.replace('/', '').strip()
    quotes = {}
    for quote in json.loads(json_str):
        quotes[quote['t'].upper()] = quote
    return quotes
Example #27
def random_pair():
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&list=random&rnnamespace=0&rnlimit=2'
    url = host + parameters

    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)

    return page_dictionary['query']['random'][0]['title'].encode(
        'utf-8'), page_dictionary['query']['random'][1]['title'].encode(
            'utf-8')
Example #28
def fetch_friend_lists_for_characters(
        namespace, character_list: List[str],
        problematic_character_ids: List[int]) -> List[dict]:
    """
    Return the list of friend list responses from the server. Also return the list of character ids who couldn't be
    loaded due to errors!
    """

    fetch_logger().info(f"fetch_friend_lists_for_characters {character_list}")
    # Attempt to build a url for this set of characters and handle errors encountered along the way.
    unique_characters = list(set(character_list))

    if len(character_list) > 1:
        character_ids = ",".join(unique_characters)
    else:
        character_ids = str(character_list[0])

    friend_list_results = []

    url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/{namespace}/characters_friend/" \
          f"?character_id={character_ids}&c:resolve=world"

    try:
        decoded = fetch_url(url)
        friend_list_results = decoded["characters_friend_list"]

    except GiveUpException as possible_overload_error:
        # Some characters have errors when you load the friends list. unclear why.
        if len(character_list) > 1:
            fetch_logger().error(
                f"Unable to load large group of ids: {character_list}")
            fetch_logger().error(str(possible_overload_error))
            for indi_index, individual in enumerate(character_list):
                fetch_logger().info(
                    f"Attempting to run individual {indi_index} ({individual})"
                )

                individual_results = fetch_friend_lists_for_characters(
                    namespace, [individual], problematic_character_ids)
                if len(individual_results) > 0:
                    friend_list_results.extend(individual_results)
                else:
                    fetch_logger().warning(
                        f"Unable to fetch data for player {individual} for whatever reason"
                    )

        elif len(character_list) == 1:
            problematic_character_ids.append(character_list)

    except Exception as err:
        fetch_logger().error(
            f"Unable to fetch friendlist for {character_list} {err} giving up and moving on"
        )
    return friend_list_results
Example #29
 def fetch(self, gene):
     """Descarga el contenido de la página. NECESITA COOKIES PARA QUE ANDE."""
     url = BASE_GENECARDS_URL + gene
     logging.debug("Fetching URL " + url)
     headers = {
         "User-agent":
         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0",
         "Cookie":
         "ASP.NET_SessionId=hwjs2brcx2cbpjod4gnxijlv; rvcn=H1d-sbuapwNh7VvK5OojnZx_GNvK8NV3Y_igQlT3X1auxx-CGR_50Kv2Gtv0Wo_OSexSGqUeuYFqw_sxZCt8GPagEGA1; ARRAffinity=166bde02ef81ff7e7ac9e9a57f0ef302100f353e9212ba930c859133d8b6d672; visid_incap_146342=ng+3dLHhQg+n22cnTIdnpyPQzV0AAAAAQUIPAAAAAADYkswfNl1m+lEO6s1+k/62; nlbi_146342=67fBdWPizFQhuWCUmewSQgAAAACX5WRNOa574GkShdKAhsHo; incap_ses_789_146342=QK2DWJBMgCysSvfvbRjzCiTQzV0AAAAA6eTm5xFAT0bApBqNQRJH0w==; _ga=GA1.2.752885262.1573769256; _gid=GA1.2.1465475919.1573769256; __gads=ID=bb532cbe1d9196bc:T=1573769276:S=ALNI_MZlxaQcjBdoHS5r7fq1pdgh3l_5cg; EU_COOKIE_LAW_CONSENT=true"
     }
     data = utils.fetch_url(url, headers)
     return data
Example #30
    def download(self, remotefile, localpath=None, newfile=None):
        url = self.get_download_url(remotefile)
        if localpath:
            localfile = os.path.join(localpath,
                                     newfile or os.path.basename(remotefile))
            with open(localfile, 'wb') as fp:
                r = urllib2.urlopen(url)
                shutil.copyfileobj(r, fp)
            # or urllib retrieve
            return True

        else:
            return fetch_url(url)
Example #31
File: znds.py Project: HeddaZ/kodi-addons
def list_movies(link):
    content = fetch_url(urljoin(HOST, link))
    items = generate_items_from_page(content)

    match = re.search(
        r"<a class='nextPage' href='(.+?)'>下页</a>", content)  # noqa
    if match:
        items.append({
            'label': u'下一页',
            'path': m.url_for('list_movies', link=match.group(1))
        })

    return m.plugin.finish(items, view_mode='thumbnail')
Example #32
File: znds.py Project: HeddaZ/kodi-addons
def show_detail(page_id):
    url = '%sview-%s.html' % (HOST, page_id)
    content = fetch_url(url)

    items = [{
        'label': ''.join(match.groups()[1:]),
        'path': m.url_for('bdyun_link', link=url + match.group(1))
    } for match in re.finditer(r'<a class="btn btn-inverse pull-left".*?href="(.+?)".*?>(.+?)<span class="baidusp">(.+?)</span>\s*<span class="baidusp2">(.+?)</span>', content)]  # noqa

    if len(items) != 1:
        return m.plugin.finish(items)

    m.plugin.redirect(items[0]['path'])
Example #33
File: znds.py Project: peiit/kodi_plugins
def list_movies(link):
    content = fetch_url(urljoin(HOST, link))
    items = generate_items_from_page(content)

    match = re.search(r"<a class='nextPage' href='(.+?)'>下页</a>",
                      content)  # noqa
    if match:
        items.append({
            'label': u'下一页',
            'path': m.url_for('list_movies', link=match.group(1))
        })

    return m.plugin.finish(items, view_mode='thumbnail')
Example #34
    def download(self, remotefile, localpath=None, newfile=None):
        url = self.get_download_url(remotefile)
        if localpath:
            localfile = os.path.join(localpath,
                                     newfile or os.path.basename(remotefile))
            with open(localfile, 'wb') as fp:
                r = urllib2.urlopen(url)
                shutil.copyfileobj(r, fp)
            # or urllib retrieve
            return True

        else:
            return fetch_url(url)
Example #35
def get_titulo(pmid):
    '''
    Given a pmid, extract its title from Pubmed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index("<title>")
            titulo = fila[clave + 7:fila.index('- PubMed - NCBI', clave + 7)]
            return titulo
        except:
            pass
Example #36
def get_link(article):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=info&titles='
    url = host + parameters + article
    json_file = fetch_url(url)
    link_dictionary = json.load(json_file)

    # Get page ID
    pageid = list(link_dictionary['query']['pages'])[0]
    # Page doesn't exist if ID is -1
    if pageid == '-1':
        return

    return 'http://es.wikipedia.org/wiki/%s' % article
Example #37
def get_abstract(pmid):
    '''
    Given a pmid, extract its abstract from Pubmed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index('"abstr"')
            p = fila.index("<p>", clave)
            abstract = fila[p + 3:fila.index('</p>', p + 3)]
            return abstract
        except:
            pass
Example #38
def get_doi(pmid):
    '''
    Given a pmid, extract its DOI from Pubmed if it exists.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            doi_key = fila.index("DOI:")
            href = fila.index('href="', doi_key)
            doi = fila[href + 16:fila.index('"', href + 16)]
            return doi
        except:
            pass
Example #39
def get_keywords(pmid):
    '''
    Given a pmid, extract its keywords from Pubmed if they exist.
    '''
    pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
    html = fetch_url(pubmed_url + pmid)
    for fila in html.split("\n"):
        try:
            clave = fila.index("KEYWORDS:")
            p = fila.index("<p>", clave)
            keywords = fila[p + 3:fila.index('</p>', p + 3)]
            return keywords
        except:
            pass
Example #40
def scrape_bill_document_from_sunlight(file_path):
    try:
        file_path = file_path.strip()

        #define path to write file
        out_file_path = file_path.split("/bills")[-1]
        out_file_path = re.sub("\s+", "_", out_file_path)
        out_dir_root_path = "{0}/scraped_bills".format(DATA_PATH)
        out_file_name = "{0}{1}.json".format(out_dir_root_path, out_file_path)

        bill_json = json.loads(codecs.open(file_path, encoding="utf8").read())

        # filter versions to be only the first and last
        try:
            bill_json['versions'] = [bill_json['versions'][0], bill_json['versions'][-1]]
        except IndexError:
            return

        base_url = "{0}/{1}".format("http://static.openstates.org/documents", bill_json['state'])
        urls = ["{0}/{1}".format(base_url, x['doc_id']) for x in bill_json['versions']]
        source_urls = [x['url'] for x in bill_json['versions']]

        for i, url in enumerate(urls):

            bill_document = utils.fetch_url(url)

            #hash bill using base64
            if bill_document is not None:
                bill_document = base64.b64encode(bill_document)
            else:
                logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(
                    file_path, url, i, "link error"))

            bill_json['versions'][i]['bill_document'] = bill_document

        if not os.path.exists(os.path.dirname(out_file_name)):
            os.makedirs(os.path.dirname(out_file_name))
        with codecs.open(out_file_name, "w", encoding="utf8") as f:
            f.write(json.dumps(bill_json))

        logging.info("successfully scraped bill: {0}".format(out_file_path))

    except Exception as e:
        trace_message = re.sub("\n+", "\t", traceback.format_exc())
        trace_message = re.sub("\s+", " ", trace_message)
        trace_message = "<<{0}>>".format(trace_message)
        m = "Failed to obtain documents for {0}: {1}".format(file_path, trace_message)
        logging.error(m)

    return
Example #41
    def test_fetch_url(self):
        self.assertEqual(None, utils.fetch_url("no such thing"))
        self.assertEqual(None, utils.fetch_url("http://no-such-thing-hopefully.abc/asdasd123"))

        def urlopen_mock(html: bytes, code: int):
            class MockResponse:
                def read(self):
                    return html

                def getcode(self):
                    return code

            @contextmanager
            def urlopen(url):
                yield MockResponse()

            return urlopen

        with patch("urllib.request.urlopen", urlopen_mock(b"works", 200)):
            self.assertEqual(b"works", utils.fetch_url("http://canonical.com/"))

        # return `None` for any code >= 400
        with patch("urllib.request.urlopen", urlopen_mock(b"some message", 400)):
            self.assertEqual(None, utils.fetch_url("http://canonical.com/"))
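The test above pins down the contract of utils.fetch_url: the body is returned as bytes, and None is returned both when urllib.request.urlopen raises and when the status code is 400 or higher. A minimal implementation consistent with that contract (a sketch, not necessarily the project's actual code) could be:

import urllib.request


def fetch_url(url):
    # Return the response body as bytes, or None on any error
    # or on an HTTP status code of 400 or higher.
    try:
        with urllib.request.urlopen(url) as response:
            if response.getcode() >= 400:
                return None
            return response.read()
    except Exception:
        return None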
Example #42
File: znds.py Project: peiit/kodi_plugins
def show_detail(page_id):
    url = '%sview-%s.html' % (HOST, page_id)
    content = fetch_url(url)

    items = [{
        'label': ''.join(match.groups()[1:]),
        'path': m.url_for('bdyun_link', link=url + match.group(1))
    } for match in re.finditer(
        r'<a class="btn btn-inverse pull-left".*?href="(.+?)".*?>(.+?)<span class="baidusp">(.+?)</span>\s*<span class="baidusp2">(.+?)</span>',
        content)]  # noqa

    if len(items) != 1:
        return m.plugin.finish(items)

    m.plugin.redirect(items[0]['path'])
Example #43
def get_link(article):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=info&titles='
    url = host + parameters + article
    json_file = fetch_url(url)
    link_dictionary = json.load(json_file)
    

    # Get page ID
    pageid = list(link_dictionary['query']['pages'])[0]
    # Page doesn't exist if ID is -1
    if pageid == '-1':
        return
    
    return 'http://es.wikipedia.org/wiki/%s' % article
Example #44
def fetch(article_name):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=extracts&exsentences=3&explaintext=true&titles='
    url = host + parameters + urllib2.quote(article_name)

    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)

    # Handle the varying pageid key in the JSON file provided by MediaWiki
    pageid = list(page_dictionary['query']['pages'])[0]
    # Page does not exist
    if pageid == '-1':
        return "The article %s doesn't exist." % (article_name)

    extract = page_dictionary['query']['pages'][pageid]['extract']
    return extract.encode('utf-8')
Example #45
def fetch(article_name):
    host = 'http://es.wikipedia.org/w/api.php?'
    parameters = 'format=json&action=query&prop=extracts&exsentences=3&explaintext=true&titles='
    url = host + parameters + urllib2.quote(article_name)

    json_file = fetch_url(url)
    page_dictionary = json.load(json_file)
    
    # Handle the varying pageid key in the JSON file provided by MediaWiki
    pageid = list(page_dictionary['query']['pages'])[0]
    # Page does not exist
    if pageid == '-1':
        return "The article %s doesn't exist." % (article_name)

    extract = page_dictionary['query']['pages'][pageid]['extract']
    return extract.encode('utf-8')
Example #46
def scrape_bill_document_from_original_source(filePath):
    filePath = filePath.strip()

    outFilePath = "/".join(filePath.split("/")[7:])
    outFilePath = re.sub("\s+", "_", outFilePath)
    outDirRootPath = "/mnt/data/sunlight/dssg/scraped_bills_new"
    outFileName = "{0}/{1}.json".format(outDirRootPath, outFilePath)

    billFile = codecs.open(filePath, encoding="utf8").read()
    billJson = json.loads(billFile)

    # filters documents that are resolutions
    bill_text_count = [1 for x in billJson['type'] if "bill" in x.lower()]
    if sum(bill_text_count) < 1:
        return

    # filter versions to be only the first and last
    billJson['versions'] = [billJson['versions'][0], billJson['versions'][-1]]

    urls = [x['url'] for x in billJson['versions']]

    for i, url in enumerate(urls):

        billDocument = utils.fetch_url(url)

        if billDocument is not None:
            billDocument = base64.b64encode(billDocument)
        else:
            logging.error("file {0}, url {1}, version {2}, error: << {3} >>".format(filePath, url, i, "link error"))

        billJson['versions'][i]['bill_document'] = billDocument

    if not os.path.exists(os.path.dirname(outFileName)):
        os.makedirs(os.path.dirname(outFileName))
    with codecs.open(outFileName, "w", encoding="utf8") as f:
        f.write(json.dumps(billJson))

    logging.info("successfully scraped bill: {0}".format(outFilePath))

    return
Example #47
    def leader_board_sample(self, limit=50):
        """
        I have used more than one method to get the initial list of
        character Ids. The first version simply used the id's of characters
        I was knew of. This new version is a bit less biased It gathers the
        players who were in the top limit places on the current leader-board
        for all areas of the leader-board available.
        Note that all leader-board stats are strongly correlated.
        """

        seed_ids = []
        for leaderboard_type in ["Kills", "Time", "Deaths", "Score"]:
            fetch_logger().info(f"Fetching {leaderboard_type} {limit}")
            url = f"http://census.daybreakgames.com/s:{SERVICE_ID}/get/" \
                  f"{self.namespace}/leaderboard/?name={leaderboard_type}" \
                  f"&period=Weekly&world={self.server_id}&c:limit={limit}"

            fetch_logger().info(url)
            decoded = fetch_url(url)
            try:
                decoded_leaderboard = decoded["leaderboard_list"]
            except Exception as err:
                # fetch_logger().error(decoded)
                fetch_logger().error(url)
                fetch_logger().error(f"Failed with {err}")
                raise err
            for characters in decoded_leaderboard:
                character_id = characters.get("character_id")
                if character_id is not None:
                    seed_ids.append(character_id)
        unique = list(set(seed_ids))
        # Record the starting nodes for debugging. The busy_timeout prevents
        # an issue where sqlite3 was not waiting long enough.
        # It probably isn't needed but....
        self.archive_connection.execute("PRAGMA busy_timeout = 30000")
        self.archive_connection.execute(
            "INSERT INTO seed_nodes (name,seed_nodes) VALUES(?,?)",
            (self.table_name, ",".join(unique)),
        )
        return seed_ids
Example #48
    def _fetch_pcsapi(self, path, params=None, data=None, headers={}):
        assert self._bduss is not None
        url = urljoin(self._pcsapi_baseUrl, path) + "?app_id=266719"
        if params:
            url += "&" + urlencode(params)
        headers["Cookie"] = "BDUSS=" + self._bduss

        try:
            r = fetch_url(url, data, headers)
        except urllib2.HTTPError as e:
            try:
                error_content = e.read()
                if e.headers.get("content-encoding") == "gzip":
                    error_content = unzip(error_content)
                eo = json.loads(error_content)

            except:
                raise e
            else:
                raise PCSApiError(eo.get("error_code"), eo.get("error_msg"))

        return json.loads(r)
Example #49
def find_article(url, end_article):
    json_file = fetch_url(url)
    link_dictionary = json.load(json_file)
    
    # Handle the varying pageid key in the JSON file provided by MediaWiki
    pageid = list(link_dictionary['query']['pages'])[0]
    # Page does not exist
    if pageid == '-1':
        return False

    article_entry = {'ns': 0, 'title': end_article}
    if article_entry in link_dictionary['query']['pages'][pageid]['links']:
        return True

    # Continue if link list is not complete yet
    if 'query-continue' in list(link_dictionary):
        continue_string = urllib2.quote(link_dictionary['query-continue']['links']['plcontinue'])
        ''' Endless appending of plcontinues, can be improved '''
        new_url = url + '&plcontinue=' + continue_string
        return find_article(new_url, end_article)
    else:
        return False
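The inline note above ("Endless appending of plcontinues, can be improved") refers to the recursive call growing the URL with one more plcontinue parameter per request; a hedged sketch of an iterative variant, reusing the same helpers and response shape this snippet assumes, might look like:

def find_article_iter(url, end_article):
    # Hypothetical iterative variant: rebuild the URL from the original one
    # plus the latest plcontinue value instead of appending repeatedly.
    target = {'ns': 0, 'title': end_article}
    plcontinue = None
    while True:
        page_url = url if plcontinue is None else url + '&plcontinue=' + plcontinue
        link_dictionary = json.load(fetch_url(page_url))
        pageid = list(link_dictionary['query']['pages'])[0]
        if pageid == '-1':
            return False
        if target in link_dictionary['query']['pages'][pageid].get('links', []):
            return True
        if 'query-continue' not in link_dictionary:
            return False
        plcontinue = urllib2.quote(
            link_dictionary['query-continue']['links']['plcontinue'])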
Example #50
    def _fetch_pcsapi(self, path, params=None, data=None, headers={}):
        assert self._bduss is not None
        url = urljoin(self._pcsapi_baseUrl, path) + '?app_id=266719'
        if params:
            url += '&' + urlencode(params)
        headers['Cookie'] = 'BDUSS=' + self._bduss

        try:
            r = fetch_url(url, data, headers)
        except urllib2.HTTPError as e:
            try:
                error_content = e.read()
                if e.headers.get('content-encoding') == 'gzip':
                    error_content = unzip(error_content)
                eo = json.loads(error_content)

            except:
                raise e
            else:
                raise PCSApiError(eo.get('error_code'), eo.get('error_msg'))

        return json.loads(r)
Example #51
    def _fetch_pcsapi(self, path, params=None, data=None, headers={}):
        assert self._bduss is not None
        url = urljoin(self._pcsapi_baseUrl, path) + '?app_id=266719'
        if params:
            url += '&' + urlencode(params)
        headers['Cookie'] = 'BDUSS=' + self._bduss

        try:
            r = fetch_url(url, data, headers)
        except urllib2.HTTPError as e:
            try:
                error_content = e.read()
                if e.headers.get('content-encoding') == 'gzip':
                    error_content = unzip(error_content)
                eo = json.loads(error_content)

            except:
                raise e
            else:
                raise PCSApiError(eo.get('error_code'), eo.get('error_msg'))

        return json.loads(r)
Example #52
    def _fetch_pcsapi(self, path, params=None, data=None, headers={}):
        url = urljoin(self._pcsapi_baseUrl, path) + '?'
        if params:
            url += urlencode(params) + '&'
        url += 'access_token=' + self._access_token

        try:
            r = fetch_url(url, data, headers)
        except urllib2.HTTPError as e:
            try:
                error_content = e.read()
                if e.headers.get('content-encoding') == 'gzip':
                    error_content = gzip.GzipFile(fileobj=StringIO(
                        error_content), mode='rb').read()
                eo = json.loads(error_content)

            except:
                raise e
            else:
                raise PCSApiError(eo.get('error_code'), eo.get('error_msg'))

        return json.loads(r)
Example #53
    def login(self):
        page = fetch_url(self.LOGIN_URL)
        if self.logged_in(page):
            return True
        else:
            forms = ParseResponse(mechanize.urlopen(self.LOGIN_URL),
                                  backwards_compat=False)
            form = forms[1]
            form['username'] = self.username
            form['password'] = getpass()
            form.find_control("cookietime").items[0].selected = True

            request = form.click()
            try:
                response = mechanize.urlopen(request)
            except mechanize.HTTPError, response2:
                exit('HTTP error while logging in.')

            content = response.read()
            if self.logged_in(content):
                return True
            else:
                return False
Example #54
 def setup(self):
     if os.path.isfile(self.__filename):
         return True
     else:
         return utils.fetch_url(self.__url, self.__filename)
Example #55
 def _download_img(self):
     imgdata = utils.fetch_url(self._imgurl, timeout=10)
     xbmcvfs.File(self._tmp_imgfile, 'w').write(imgdata)
Example #56
def run_command(message_data):
    sender = message_data['sender']
    said = message_data['said']
    # '#channel' if room, 'sender' if private message
    current_channel = message_data['current_channel']
    params = message_data['params']

    # Get title from web pages
    if 'http://' in said:
        url = extract_url(said)
        title = get_title(url)
        if title:
            say(current_channel, 'Title: %s' % title)

    # Get link to Wikipedia article
    if '[[' in said:
        for article_name in extract_article(said):
            say(current_channel, get_link(article_name))

    # Reply to mention with a random quote
    if nickname in said:
        say(current_channel, random_quote(sender))

    ## IRC commands ##
    search_term = '+'.join(params)
    
    # List all commands
    if said.find('@help') == 0:
        say(sender, 'Search engines: google, wa, ddg, drae, dpd, en, es')
        say(sender, 'Misc: random [list], conv (unit) to (unit), fetch (wikipedia_article), link <start|get|check|stop>, calc (expression)')

    # Google
    elif said.find('@google') == 0:
        say(current_channel, 'https://www.google.com/search?q=%s' % search_term)

    # Wolfram Alpha
    elif said.find('@wa') == 0:
        say(current_channel, 'http://www.wolframalpha.com/input/?i=%s' % search_term)

    # DuckDuckGo
    elif said.find('@ddg') == 0:
        say(current_channel, 'http://duckduckgo.com/?q=%s' % search_term)

    # DRAE
    elif said.find('@drae') == 0:
        say(current_channel, 'http://lema.rae.es/drae/?val=%s' % search_term)

    # DPD
    elif said.find('@dpd') == 0:
        say(current_channel, 'http://lema.rae.es/dpd/?key=%s' % search_term)

    # Jisho kanji lookup
    elif said.find('@kan') == 0:
        escaped_term = urllib2.quote(search_term)
        say(current_channel, 'http://jisho.org/kanji/details/%s' % escaped_term)

    # EN > JP
    elif said.find('@ei') == 0:
        say(current_channel, 'http://jisho.org/words?jap=&eng=%s&dict=edict' % search_term)

    # JP > EN
    elif said.find('@ni') == 0:
        escaped_term = urllib2.quote(search_term)
        say(current_channel, 'http://jisho.org/words?jap=%s&eng=&dict=edict' % escaped_term)

    # EN > ES
    elif said.find('@en') == 0:
        say(current_channel, 'http://www.wordreference.com/es/translation.asp?tranword=%s' % search_term)

    # ES > EN
    elif said.find('@es') == 0:
        say(current_channel, 'http://www.wordreference.com/es/en/translation.asp?spen=%s' % search_term)

    # Random choice
    elif said.find('@random') == 0:
        if len(params) == 1:
            say(current_channel, 'f****t')
        elif len(params) > 1:
            say(current_channel, random.choice(said.split(',')).strip())
        else:
            say(current_channel, random.choice([0, 1]))

    # Unit converter
    elif said.find('@conv') == 0:
        if 'to' not in params:
            return
        index = params.index('to')
        amount = params[0]
        unit_from = params[1:index]
        unit_from = urllib2.quote(' '.join(unit_from))
        # 'to' == params[index]
        unit_to = params[index + 1:]
        unit_to = urllib2.quote(' '.join(unit_to))

        conversion_url = 'http://www.google.com/ig/calculator?hl=en&q='

        conversion = fetch_url(conversion_url + amount + unit_from + '=?' + unit_to).read()
        parsed_conversion = conversion.split('"')

        # Check for errors
        if len(parsed_conversion[5]) == 0:
            unit_result = urllib2.unquote(unit_to)
            say(current_channel, '%s %s' % (parsed_conversion[3].split()[0], unit_result))

    # Linkrace module
    elif said.find('@link') == 0:
        # Get race links
        if params[0] == 'get':
            url = 'http://es.wikipedia.org/wiki/%s'
            start, end = random_pair()
            starturl = url % urllib2.quote(start)
            endurl = url % urllib2.quote(end)
            say(current_channel, 'Start article is %s' % starturl)
            say(current_channel, 'Goal article is %s' % endurl)

        # Check if chain is valid
        elif params[0] == 'check':
            chain = ' '.join(params[1:])
            broken_links = check_chain(chain)
            if not broken_links:
                say(current_channel, 'The chain is valid.')
            else:
                error_list = ' | '.join(broken_links)
                say(current_channel, error_list)
                say(current_channel, 'The chain is not valid.')

    # Calculator
    elif said.find('@calc') == 0:
        expression = ''.join(params)
        result = str(calculate(expression))
        say(current_channel, result)

    # Wikipedia fetch
    elif said.find('@fetch') == 0:
        article_name = ' '.join(params)
        extract = fetch(article_name)
        say(current_channel, extract)

    # Text game
    elif said.find('@dicks') == 0:
        global game
        # Commands available for everyone
        if params[0] == 'join':
            game.join_game(sender)
        elif params[0] == 'players':
            say(current_channel, [player.name for player in game.players])
        # Commands available for players
        if sender in [player.name for player in game.players]:
            if params[0] == 'panel':
                panel_url = sprunge(game.panel(sender))
                say(sender, '[i] Uploading panel')
                say(sender, panel_url)
            elif params[0] == 'settle':
                group = params[1]
                game.settle(sender, group)
            elif params[0] == 'move':
                troop = params[1]
                new_position = [params[2], params[3]]
                game.move(sender, troop, new_position)
    
    ## Owner commands ##
    if sender == owner:
        # Disconnect
        if said == '.quit':
            execute('QUIT')
            sys.exit(0)
        
        # Send message from bot
        elif said.find('.say') == 0:
            if len(params) > 1:
                say(params[0], ' '.join(params[1:]))

        # Print userlist
        elif said.find('.users') == 0:
            say(current_channel, str(users))

        # Bot joins
        elif said.find('.join') == 0:
            channel = params[0]
            execute('JOIN %s' % channel)

        # Bot parts
        elif said.find('.part') == 0:
            execute('PART %s' % current_channel)
            del users[current_channel]

        # Bot kicks
        elif said.find('.kick') == 0:
            user = params[0]
            reason = ' '.join(params[1:])
            if not reason:
                reason = 'huh'
            bot_kick(current_channel, user, reason)
Example #57
def get_metrics(start, end, author = None, config = None):
    start = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")

    metrics = {
        'commits' : 0,
        'git_push' : 0,
        'issues' : {
            'opened' : 0,
            'closed' : 0,
        },
        'merge_requests' : {
            'opened' : 0,
            'closed' : 0,
        },
    }

    if config is None:
        config = get_config(os.environ['HOME'] + '/.qe-metrics/gitlab.conf')

    url = config['url']
    user = config['username']
    token = config['private_token']

    if not author:
        author = user

    ##### NEW issues opened during period
    atom_url = '%s/u/%s.atom?private_token=%s' % (url, author, token)
    dom = parseString(fetch_url(atom_url))

    for entry in dom.getElementsByTagName("entry"):
        title = entry.getElementsByTagName("title")[0].firstChild.wholeText
        updated_on = entry.getElementsByTagName("updated")[0].firstChild.wholeText
        updated_on = datetime.strptime(updated_on, "%Y-%m-%dT%H:%M:%SZ")

        # skip older or newer events
        if not (start <= updated_on <= end):
            continue

        if title.find('pushed') > -1:
            metrics['git_push'] += 1

            # commits is a bit wrong.
            # when pushing to a new branch for the first time
            # GitLab reports commits from other users as well,
            # for example when merging to the latest upstream
            # this can be fixed by a second parameter, the user real name
            # as it appears in the commits, but it doesn't work
            # nicely with --author and isn't that important for now!
            for summary in entry.getElementsByTagName("summary"):
                for a in summary.getElementsByTagName("a"):
                    href = a.getAttribute('href')
                    if href.find('/commit/') > -1:
                        metrics['commits'] += 1
        elif title.find('opened issue') > -1:
            metrics['issues']['opened'] += 1
        elif title.find('closed issue') > -1:
            metrics['issues']['closed'] += 1
        elif title.find('opened MR') > -1:
            metrics['merge_requests']['opened'] += 1
        elif title.find('accepted MR') > -1:
            metrics['merge_requests']['closed'] += 1



    return metrics