def get_location(id):
    HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]

    session = requests.Session()
    browser = RoboBrowser(session=session,
                          user_agent=random.choice(HEADERS_LIST),
                          parser='lxml')
    first_url = "https://twitter.com/intent/user?user_id=" + str(id)
    browser.open(first_url)
    results = browser.find_all("span", {"class": "nickname"})
    if results:
        handle = " ".join(str(results[0].text).split())
        url = "https://twitter.com/" + handle
        browser.open(url)
        results = browser.find_all(
            "span", {"class": "ProfileHeaderCard-locationText u-dir"})
        if results:
            return " ".join(str(results[0].text).split())
    return None
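
A minimal usage sketch for get_location, assuming requests, random, and RoboBrowser are imported at module level as the function implies; the numeric Twitter ID is a placeholder.

if __name__ == "__main__":
    # placeholder numeric Twitter user id, for illustration only
    location = get_location(12345678)
    print(location if location is not None else "no public location found")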
Example #2
def scrape_cosmo_exam(url,email,password):
    browser = RoboBrowser()
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = email
    search['user[password]'] = password
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])

    # locate and follow the "Announcements" link
    all_links = browser.find_all('a')
    announcements_key = list(filter(lambda x: 'Announcements' in x, all_links))[0]
    announcement_ind = all_links.index(announcements_key)
    browser.follow_link(browser.find_all('a')[announcement_ind])

    # helper function 2: pull the element at position `ind` out of each <h2>
    def date_extract(ind):
        return list(mapper(lambda x: list(x.children)[ind], browser.find_all('h2')))

    # helper function 3: keep only entries that mention any of the given keywords
    def matcher(lst, *matches):
        if not matches:
            matches = ['exam', 'reminder']
        else:
            matches = matches[0]
        return filterer(lambda x: any(string.lower() in str(x).lower() for string in matches), lst)

    # obtaining title objects - tags (the helpers above must be defined before this call)
    titles = mapper(lambda x: date_extract(1), browser.find_all('h2'))[0]

    return titles
Example #3
class Question(object):
    """ Zhihu parser, question obj"""
    def __init__(self, page_url):
        self.url = page_url
        self.browser = RoboBrowser(history=True, user_agent='nemo1')
        self.browser.open(self.url)

    def get_answer_count(self):
        if self.browser.find("h3", id="zh-question-answer-num") is not None:
            return int(
                self.browser.find("h3",
                                  id="zh-question-answer-num")["data-num"])

    def get_all_answer_url_list(self):
        results = []
        if self.get_answer_count() <= 10:
            for answer_div in self.browser.find_all(
                    "div", class_="zm-item-answer  zm-item-expanded"):
                results.append(URL_PREFIX + answer_div.find("link")["href"])
        else:
            for i in range(0, (self.get_answer_count() // 10) + 1):
                offset = i * 10
                if i == 0:
                    for answer_div in self.browser.find_all(
                            "div", class_="zm-item-answer  zm-item-expanded"):
                        results.append(URL_PREFIX +
                                       answer_div.find("link")["href"])
                    # print results
                else:
                    # pass
                    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
                    _xsrf = self.browser.find("input",
                                              attrs={'name': '_xsrf'})["value"]
                    params = json.dumps({
                        "url_token":
                        int(self.url[-8:-1] + self.url[-1]),
                        "pagesize":
                        10,
                        "offset":
                        offset
                    })
                    data = {'_xsrf': _xsrf, 'method': "next", 'params': params}
                    header = {
                        'User-Agent':
                        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
                        'Host': "www.zhihu.com",
                        'Referer': self.url
                    }
                    r = requests.post(post_url,
                                      data=data,
                                      headers=header,
                                      verify=False)
                    answers = r.json()["msg"]
                    # print len(answers)
                    # pdb.set_trace()
                    for ans in answers:
                        soup = BeautifulSoup(ans, 'html.parser')
                        results.append(URL_PREFIX + soup.find("link")["href"])
        return results
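
A short usage sketch for the class above, assuming the module-level imports it relies on (requests, json, BeautifulSoup); URL_PREFIX is assumed to be defined elsewhere in the module, and the question URL is a placeholder.

URL_PREFIX = "http://www.zhihu.com"  # assumed value; defined elsewhere in the original module
question = Question("http://www.zhihu.com/question/12345678")  # placeholder question URL
print(question.get_answer_count())
for answer_url in question.get_all_answer_url_list():
    print(answer_url)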
Example #4
class Downloader():
    def __init__(self, proxy=None, worker_num=0):
        self.worker_num = worker_num
        session = Session()
        if proxy is not None:
            session.proxies = {'http': proxy, 'https': proxy}
        self.browser = RoboBrowser(history=True,
                                   parser='html.parser',
                                   session=session)

    def get_download_link(self, book_url):
        self.browser.open(book_url)
        for link in self.browser.find_all("a"):
            if "download.php?t=1" in str(link):
                return f"https://www.lectulandia.cc{link['href']}"

    def download_book(self, download_url):
        self.browser.open(download_url)
        pattern = re.compile("var linkCode = \"(.*?)\";")
        section = pattern.findall(str(self.browser.parsed))
        bee_url = f'https://www.beeupload.net/file/{section[0]}'
        self.browser.open(bee_url)
        try:
            filename = self.browser.find(
                "div", id="fileDescription").find_all("p")[1].text.replace(
                    "Name: ", "")

            size = self.browser.find(
                "div", id="fileDescription").find_all("p")[2].text
            file_url = self.browser.find("a", id="downloadB")
            time.sleep(2)
            self.browser.follow_link(file_url)
            with open(f"books/{filename}", "wb") as epub_file:
                epub_file.write(self.browser.response.content)
            return filename, size
        except:
            print(self.browser.parsed)

    def get_book_page_list(self, page):
        self.browser.open(f'https://www.lectulandia.cc/book/page/{page}/')
        return [
            f"https://www.lectulandia.cc{book['href']}"
            for book in self.browser.find_all("a", class_="card-click-target")
        ]

    def download_full_page(self, page):
        print(f"Downloading page: {page} ")
        books = self.get_book_page_list(page)
        for book in books:
            time.sleep(2)
            download_url = self.get_download_link(book)
            print(f"Worker: {self.worker_num} on page: {page}",
                  self.download_book(download_url))
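
A minimal driver for the Downloader above, assuming the module-level imports it relies on (re, time, requests.Session) and that a books/ directory already exists next to the script.

if __name__ == "__main__":
    downloader = Downloader(proxy=None, worker_num=0)
    downloader.download_full_page(1)  # download every book listed on page 1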
Example #5
def scrape_cs2040s(url, email, password):
    browser = RoboBrowser(parser='html.parser')
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = str(email)
    search['user[password]'] = str(password)
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])

    # missions
    browser.follow_link(browser.find_all('a')[11])

    # find names
    reduced = filterer(lambda x: len(list(x.children)) >= 1,
                       browser.find_all('th'))
    reduced = filterer(lambda x: 'colspan' in x.attrs, reduced)
    # unsure of object structure so convert to list type and assess last element
    names = mapper(lambda x: list(list(x.children)[-1])[-1], reduced)

    # find deadlines
    deadlines_tags = list(
        filter(lambda x: x['class'] == ['table-end-at'],
               browser.find_all('td')))
    deadlines = list(
        map(lambda x: (list(x))[0] if list(x) else 'not yet', deadlines_tags))
    curr_yr = datetime.now().year

    #returns a list of datetime objects
    dates = mapper(
        lambda x: str(datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M'))
        if x != 'not yet' else 'Not yet', deadlines)

    array = []
    for n, d in zip(names, dates):
        dic1 = {}
        dic1['title'] = n
        dic1['datetime'] = d
        array.append(dic1)

    dic = {}
    dic['data'] = array

    #scrape exam details
    with open(
            '/Users/sherrywu1999/Desktop/untitled/callie/python/deadlines/data.json',
            'w') as json_file:
        json.dump(dic, json_file)
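
A hedged call sketch: mapper and filterer are assumed to be list-returning wrappers around map and filter defined elsewhere in this module, the sign-in URL and credentials are placeholders, and note that the function writes data.json to the hard-coded path above.

scrape_cs2040s("https://coursemology.org/users/sign_in",  # placeholder sign-in URL
               "student@example.com", "password")          # placeholder credentials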
Example #6
    def getData(self):

        # temp
        sqlString = "DELETE FROM coin_toomics WHERE date='%s'" % (self.todayString)
        self.dbconn.cur.execute(sqlString, )
        # temp

        browser = RoboBrowser(history=True,
            user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/33.0.1750.152 Chrome/33.0.1750.152 Safari/537.36')

        # toomics login ---------------------------------------------------------------- start
        auth_url = 'http://m.toomics.com/auth/layer_login'
        data= {'user_id': 'userid value', 'user_pw': 'password value', 'iSaveUserId': 'true', 'iKeepCookie': 'true', 'returnUrl': '', 'direction': '' }
        browser.open(auth_url, method='post', data=data)
        # # toomics login ---------------------------------------------------------------- end

        url_content = 'http://m.toomics.com/mypage/charge'
        browser.open(url_content)
        self.coinTopElements = browser.find_all('ul', class_='list-charge')

        self.product_title_list, self.product_price_list = [], []

        for idx, cel in enumerate(self.coinTopElements):
            # print cel

            for pt in cel.find_all('div', class_='coin-item'):
                print pt.get_text()
                self.product_title_list.append(pt.get_text())

            for pp in cel.find_all('span', class_='price'):
                print pp.get_text()
                self.product_price_list.append(pp.get_text())
Example #7
def processPage(pageVal):
    ageValues = []
    noAgeCount = 0
    usersProcessed = 0

    browser = RoboBrowser(parser='html.parser')
    browser.open(baseURL + str(pageVal) + options)

    #Find all the links on the page, if the href in a link has member.php in it then
    #it's a member link, so pull out the uid from it and add that uid to the list
    userIDs = [
        link['href'].split('u=')[1]
        for link in browser.find_all('a', href=True)
        if 'member.php?' in link['href']
    ]
    usersProcessed = len(userIDs)

    for user in userIDs:
        age = getUserAge(user)
        time.sleep(1)
        if age is None:
            noAgeCount += 1
        else:
            ageValues.append(age)

    return [ageValues, noAgeCount, usersProcessed]
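
A usage sketch, assuming baseURL, options, and getUserAge() are defined elsewhere in the module as the function expects.

ages, no_age_count, users_processed = processPage(1)
print("processed %d users, %d without an age" % (users_processed, no_age_count))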
Example #8
def GetTranslations(wordToFind):
    print("\nGet translations for " + wordToFind)
    # Browser
    browser = RoboBrowser(history=True, parser="html5lib")

    #Open the login page. Setup our login info. Submit the info.
    print ("Connecting to Ord.se ...")
    browser.open('http://www.ord.se/oversattning/engelska/?s='+wordToFind+'&l=SVEENG')
    definitions = browser.find_all(class_="search-result-word-wrapper")

    Translations = []
    #Skip definitions without a word class
    for definition in definitions:
        #print(definition)
        #print()
        wordClasses = definition.find_all(class_="word-class")
        if not wordClasses:
            continue  # removing from the list while iterating would skip entries
        else:
            for wordClass in wordClasses:
                wordClass = wordClass.getText()
                #print(definition)
                #print("\n")
                actualSearchWord = definition.find(class_="search-result-head-word").getText()
                actualSearchWord = actualSearchWord.rstrip()
                htmlDefinitions = definition.find_all(class_="normal font1 readable")
                translations = []
                for translation in htmlDefinitions:
                    translations.append(translation.getText())
                Translations.append(Translation(actualSearchWord, wordClass, translations))
    return Translations
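
A quick usage sketch; the Translation class is assumed to be defined elsewhere in the module, so the returned objects are simply printed as-is.

for translation in GetTranslations("hund"):
    print(translation)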
Example #9
def HentaiMS(keyword, PageNum):

    all_comic_links = []

    base_URL = 'http://search.hentai.ms/?tag=' + keyword + '&num=' + PageNum + '8&related=&pages=&box=14&es=14'
    print(base_URL)
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    browser.open(base_URL)

    td = browser.find_all('td', {'id': 'search_gallery_item'})
    print(len(td))

    for line in td:

        this_link = line.find_all('a', href=True)

        try:
            if 'tags' not in this_link[1]['href']:
                print(this_link[1]['href'])
                all_comic_links.append(this_link[1]['href'])
        except:
            pass

    return all_comic_links
Example #10
def get_source_code(commitId, project):
    import random
    import requests
    from robobrowser import RoboBrowser

    HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]

    link = []

    session = requests.Session()
    browser = RoboBrowser(session=session,
                          user_agent=random.choice(HEADERS_LIST),
                          parser="lxml")
    url = "https://github.com/" + project.replace("-",
                                                  "/") + "/commit/" + commitId

    browser.open(url + "?diff=unified")
    results = browser.find_all("a")
    for item in results:
        if ".java" in str(item):
            second_url = "https://raw.githubusercontent.com/" + project.replace(
                "-", "/") + "/" + commitId + "/" + item.string
            browser.open(second_url)
            return browser.find().text
Example #11
File: FenH.py, Project: Selezar211/sdfsdf
def FenHen(keyword):

    result_urls = []

    base_URL = 'http://fenhentai.blogspot.co.uk/search?q=' + keyword
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    while True:
        browser.open(base_URL)

        post_body_list = browser.find_all('div',
                                          {'class': 'post-body entry-content'})

        for post in post_body_list:
            this_image = post.find('img', src=True)
            print(this_image['src'])
            result_urls.append(this_image['src'])

        Next_Post_Link = browser.find('a', {'class': 'blog-pager-older-link'},
                                      href=True)

        if Next_Post_Link is None:
            break
        else:
            base_URL = Next_Post_Link['href']

    return result_urls
def desktop(keyword, sitename, useragent):
    parser = 'html.parser'

    browser = RoboBrowser(history=False, user_agent=useragent, parser=parser)

    browser.open('https://www.google.com/search?num=100&q=' + keyword)

    # links = browser.find_all("div", {"class": "KJDcUb"})

    #desktop div where URLs are
    links = browser.find_all("div", {"class": "g"})

    counter = 0

    print('The user Agent you used was ----> ' + useragent)

    d = []
    for i in links:
        counter = counter + 1
        if sitename in str(i):
            url = i.find_all('a', href=True)
            position = "%d" % (counter)
            rank = "%s" % (url[0]['href'])
            now = datetime.date.today().strftime("%d-%m-%Y")
            keyword = keyword
            d.append(keyword)
            d.append(position)
            d.append(rank)
            d.append(now)
            print(keyword, position, rank, now)

    csv_export(d)
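
A hedged invocation of desktop(); csv_export() and the datetime import are assumed to exist at module level, and the keyword, site name, and user agent below are placeholders.

desktop("robobrowser tutorial", "github.com",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # placeholder arguments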
Example #13
def get_residences():
    """
    Gets raw residence data from Columbia housing website, standardizes
    and cleans the data, and uploads to the database
    """
    browser = RoboBrowser()
    residences_list = []

    # makes list of links to each residence hall page
    browser.open(home_url)
    table_headers = browser.find_all(class_='views-field-title')[1:]
    residence_links = list(map(lambda x: x.find('a')['href'], table_headers))

    for link in residence_links:
        browser.open(base_url + link)
        residence_json = parse_residence_info(browser)
        if residence_json:
            if isinstance(residence_json, list):
                residences_list.extend(residence_json)
            else:
                residences_list.append(residence_json)

    if not os.path.isfile('app/data.sqlite'):
        print("Creating database")
        db.create_all()

    collate_data(residences_list)  # here just for data analysis
    print("Uploading residences to database")
    upload_residences_to_db(residences_list)
Example #14
def get_residences():
    """
    Gets raw residence data from Columbia housing website, standardizes
    and cleans the data, and uploads to the database
    """
    browser = RoboBrowser()
    residences_list = []

    # makes list of links to each residence hall page
    browser.open(HOME_URL)
    table_headers = browser.find_all(class_='views-field-title')[1:]
    residence_links = list(map(lambda x: x.find('a')['href'], table_headers))

    for link in residence_links:
        browser.open(BASE_URL + link)
        residence_json = parse_residence_info(browser)
        residences_list.extend(residence_json)

    if not os.path.isfile('app/data.sqlite'):
        print("Creating database")
        db.create_all()

    # uncomment to see collected data
    # collate_data(residences_list)

    print("Uploading residences to database")
    for res in residences_list:
        upload_object_to_db(Residence, res)
Example #15
def ExtractONEPAGE(page):

    final_res = ''
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    while True:
        print('loop')
        browser.open(
            'http://tools.prowebguru.com/free-online-image-extractor/free_online_image_extractor_tool.php'
        )

        form = browser.get_forms({'class': 'form-horizontal'})

        if len(form) != 0:
            print('broke')
            break

    this_form = form[0]

    this_form["website"] = page

    browser.submit_form(this_form)

    img_links = browser.find_all('img', src=True)

    for line in img_links:
        if '/tbn/' not in line['src'] and '.wp.com' in line['src']:
            final_res = line['src']
            print(final_res)

    if final_res != '':
        with open('HenRUniqueComic.txt', 'a') as f:
            f.write(final_res + '\n')
Example #16
def GetWordForms(word, wordClass):
    print("\nGetting the forms of " + word + " (" + wordClass + ")")
    #Convert the ord class to wiktionary table class form
    wordClassDict = {'SUBSTANTIV':'subst', 'VERB':'verb', 'TRANSITIVT VERB':'verb', 'INTRANSITIVT VERB':'verb', 'INTRANSITIVT DEPONENSVERB':'verb', 'ADVERB':'adverb', 'ADJEKTIV':'adj'}
    browser = RoboBrowser(history=True, parser="html5lib")
    browser.open('http://sv.wiktionary.org/wiki/'+word)
    tableClass = "template-sv-" + wordClassDict[wordClass.upper()]

    #wordFormTable = browser.find_all(class_=re.compile(r"grammar\s+"))
    wordFormTable = browser.find_all(class_=re.compile(tableClass))

    forms = [word]
    if wordFormTable:
        forms = []
        for table in wordFormTable:
            tableheader = table.find("tbody")
            tableSiblings = tableheader.find_all(class_=re.compile("b-"))
            for sibling in tableSiblings:
                if not sibling.getText() in forms and sibling.getText().isalpha():
                    forms.append(sibling.getText())
                    
        formIndex = 1
        print ("Choose the forms to use; ")
        for form in forms:
            print(str(formIndex) + ". " + form)
            formIndex = formIndex + 1
        keepIndexes = list(map(int, input().split()))

        if len(keepIndexes) > 0:
            finalForms = []
            for index in keepIndexes:
                finalForms.append(forms[index-1])
            forms = finalForms
   
    return forms
Example #17
def praca_shopping():
    from robobrowser import RoboBrowser

    browser = RoboBrowser(parser="html.parser")

    not_found = 0
    n = 0
    names = set()
    while not_found < 20:
        # print(f'Página {n}')
        found = False
        url = f"http://www.pracauberabashopping.com.br/filtro_loja_tipo.asp?tipo=vlojas.asp?busca1={n}"
        browser.open(url)
        item = browser.find("strong")
        if item:
            name = item.text
            if name != "Busca sem resultado.":
                names.add(fixed(name))
                found = True
        else:
            items = browser.find_all("a")
            if len(items) > 1:
                for item in items[1:]:
                    if item.text != "Resultado da Busca":
                        names.add(fixed(item.text))
                found = True

        if not found:
            not_found += 1

        n += 1
    return names
Example #18
def get_kicktipp_content(browser: RoboBrowser):
    """
    Get the content view area from the kicktipp page.
    """
    content = browser.find_all(id='kicktipp-content')
    if content:
        return content[0]
    return None
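
A minimal sketch of how the helper above might be called, assuming an already-opened RoboBrowser pointed at a kicktipp page; the URL is a placeholder.

browser = RoboBrowser(parser='html.parser')
browser.open('https://www.kicktipp.de/')  # placeholder page
content = get_kicktipp_content(browser)
if content is not None:
    print(content.get_text()[:200])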
Example #19
def get_bracket_data(year):
    url = 'http://espn.go.com/mens-college-basketball/tournament/bracket/_/id/{}22/'.format(year)
    b = RoboBrowser()
    b.open(url)
    data = []
    for item in b.find_all(attrs={'class': 'match'}):
        t1, t2 = [(get_id(a['href']), a['title']) for a in item('a')]
        s1, s2 = ' '.join(item.find('dd').stripped_strings).split()
        data.append([t1, t2, s1, s2])
    return data
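
A usage sketch for the bracket scraper; get_id() is assumed to be defined elsewhere in the module, and the year is illustrative.

for team1, team2, score1, score2 in get_bracket_data(2015):
    print(team1, team2, score1, score2)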
Example #20
File: luscio.py, Project: Selezar211/sdfsdf
def FetchImagesForOneComic(BASE_URL):

    # truncate any previous results file
    with open('LUSCIO_One_Comic.txt', 'w') as f:
        print(f)

    Split_URL = BASE_URL.split('/')

    Craft_Input_Url = 'https://luscious.net/c/hentai_manga/pictures/album/' + Split_URL[
        4] + '/page/'

    print(Craft_Input_Url)

    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    jobs = []
    counter = 1
    while True:
        index = 0
        print('Searching for number of pages')
        Working_URL = Craft_Input_Url + str(counter) + '/'
        print(Working_URL)
        browser.open(Working_URL)

        Thumbnail_Container = browser.find_all(
            'div', {'class': 'item thumbnail ic_container'})

        if (len(Thumbnail_Container) == 0):
            break

        for item in Thumbnail_Container:
            img_link = item.find('a', href=True)['href']

            p = multiprocessing.Process(target=MProcess,
                                        args=(
                                            index,
                                            counter,
                                            img_link,
                                        ))
            index += 1
            jobs.append(p)
            p.start()

        counter += 1

    for proc in jobs:
        proc.join()

    with open('LUSCIO_One_Comic.txt', 'r') as f:
        data = f.read().splitlines()

    result = (sorted(data, key=ExtractNumberFromURLforLuscio))

    return result
Example #21
def get_data():
    url = 'https://domo.ayy.fi/customers/sign_in'
    br = RoboBrowser()
    br.open(url)
    form = br.get_form()
    form['customer[email]'].value = config.email
    form['customer[password]'].value = config.password
    br.submit_form(form)
    data = br.find_all("li")
    #remove 2 first li elements
    return data[2:]
Example #22
def gatherData(user, password):
    baseURL = 'https://sigarra.up.pt/feup/pt/'
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(baseURL + 'web_page.Inicial')

    # Gets the login form
    form = browser.get_form(action=re.compile(r'validacao'))

    # Updates the login form with the user credentials
    form['p_user'].value = 'up' + user
    form['p_pass'].value = password

    browser.submit_form(form)

    # Goes to the user profile
    browser.open(baseURL + 'fest_geral.cursos_list?pv_num_unico=' + user)

    # Opens the extended view
    extended = browser.find(title='Visualizar informações no contexto do curso')
    browser.follow_link(extended)

    credits = []
    grades = []

    # For each html class containing grades ("i", "p" and "o"), gather data
    for row_class in ('i', 'p', 'o'):
        for row in browser.find_all(class_=row_class):
            if row.find(class_='n aprovado'):
                credits.append(row.find(class_='k n').text)
                grades.append(row.find(class_='n aprovado').text)

    return credits, grades
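
A call sketch for gatherData; the student number and password below are placeholders ('user' is the numeric part of the up-number, without the 'up' prefix).

credits, grades = gatherData("201500000", "password")  # placeholder credentials
for ects, grade in zip(credits, grades):
    print(ects, grade)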
Example #23
class BadooApi:
    def __init__(self):
        self.browser = RoboBrowser(history=True, parser='html.parser')
        self.browser.open(
            BASE_URL.format("/es/contactos/spain/zaragoza/zaragoza/"))

    def next_page(self):
        btns = self.browser.find_all(class_=re.compile(
            r".*btn.*btn--xsm.*btn--transparent.*js-pages.*"))
        try:
            print(BASE_URL.format(btns[1]['href']))
            self.browser.open(BASE_URL.format(btns[1]['href']))
        except:
            print(BASE_URL.format(btns[0]['href']))
            self.browser.open(BASE_URL.format(btns[0]['href']))

    def extract_users(self):
        profiles = []
        for a in self.browser.find_all("a", {"rel": "profile-view"}):
            profiles.append({"url": a["href"], "name": a["title"]})
        return profiles

    def get_public_profile(self, url):
        self.browser.open(BASE_URL.format(url))
        photos = []
        for img in self.browser.find_all(
                class_=re.compile(r'.*photo-list__img.*js-gallery-img.*')):
            photos.append(img['src'])
        info = self.browser.find("title").text.split("|")
        personal_info = info[0].split(",")
        name = personal_info[0]
        sex = personal_info[1]
        age = personal_info[2]
        location = info[1]
        return ({
            "name": name,
            "sex": sex,
            "age": age,
            "location": location,
            "photos": photos
        })
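
A brief usage sketch; BASE_URL is assumed to be a module-level format string such as "https://badoo.com{}", as the class implies.

api = BadooApi()
for profile in api.extract_users():
    print(profile["name"], profile["url"])
api.next_page()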
Example #24
def ExtractBigImage(
    url='http://pururin.us/view/32338/1/kimi-wa-kanojo-no-kanrika-ni-iru.html'
):
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(url)

    All_img = browser.find_all('img', src=True)

    for line in All_img:
        print(fix_bad_unicode(line))
Example #25
def scrape_cosmo(url, email, password):
    browser = RoboBrowser()
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = str(email)
    search['user[password]'] = str(password)
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])
    # missions
    browser.follow_link(browser.find_all('a')[17])

    # find deadlines
    deadlines_tags = list(filter(lambda x: x['class'] == ['table-end-at'], browser.find_all('td')))
    deadlines = list(map(lambda x: list(x)[0] if list(x) else 'not yet', deadlines_tags))


    curr_yr = datetime.now().year

    #returns a list of datetime objects
    return mapper(lambda x: datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M') if x != 'not yet' else 'Not yet', deadlines)
Example #26
async def fullwidth(arg1, *args):
    print('fullwidth')
    await torgo.send_typing(channel)
    query = arg1
    for arg in args:
        query += ("+" + arg)
    print(query)
    browser = RoboBrowser()
    browser.open('http://qaz.wtf/u/convert.cgi?text=' + query)
    cells = browser.find_all('td')
    content = cells[5].text.strip()
    print(content)
    await torgo.say(str(content))
Example #27
 def follower(self, count):
     tc = requests.session()
     tc.verify = False
     tbrowser = RoboBrowser(session=tc)
     tbrowser.open('https://www.tumblr.com/tagged/trending-topics')
     links = tbrowser.find_all("a", {"class": "post_info_link"})
     for link in links:
         try:
             self.client.post('user/follow', params={'url': link['href']})
             print("following " + link['href'] + "On account: " +
                   self.blog_url)
         except:
             print("boo")
Example #28
def scrape_snotel_sites(url=None):
    if not url:
        url = "http://www.wcc.nrcs.usda.gov/nwcc/yearcount?network=sntl&counttype=statelist&state="
    browser = RoboBrowser(parser="html5lib")
    browser.open(url)
    browser.response.raise_for_status()
    table = browser.find_all("table")[4]
    sites = [] # list of sites with name and code
    cols = [t.text.strip() for t in table.tr.find_all("th")]
    for row in table.find_all("tr"):
        if row.td and row.td.text.strip() == 'SNTL':
            items = [i.text.strip() for i in row.find_all("td")]
            sites.append(dict(zip(cols, items)))
    return sites
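
A usage sketch for the SNOTEL scraper above.

sites = scrape_snotel_sites()
print(len(sites), "SNOTEL sites found")
if sites:
    print(sites[0])  # dict keyed by the table's column headers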
Example #29
 def get_people(self, link):
     session = requests.Session()
     people_list = []
     browser = RoboBrowser(session=session, user_agent=random.choice(self.HEADERS_LIST), parser="lxml")
     url = "https://twitter.com" + link
     try:
         browser.open(url)
         results = browser.find_all("a", {
             "class": "account-group js-account-group js-action-profile js-user-profile-link js-nav"})
         for link in results:
             people_list.append(str(link.get('href')).replace("/", ""))
     except:
         pass
     return people_list
Example #30
def fetch_from_the_unicode_website(work_location):
    bs = RoboBrowser(history=True, parser="html.parser")
    bs.open(
        'https://web.archive.org/web/20161205225113/http://unicode.org/emoji/charts/full-emoji-list.html'
    )
    table_rows = bs.find_all("tr")
    print "Number of Rows %d" % len(table_rows)
    meta_data_dictionary = {}
    for row in table_rows[12:]:

        row_cols = row.find_all("td")
        if len(row_cols) == 0:
            continue

        unicode_name = row_cols[1].find("a").attrs["name"]

        meta_data_dictionary[unicode_name] = {
            'actual_name':
            row_cols[16].contents[0],
            'year_introduced':
            row_cols[17].contents[0][:4],
            'key_words':
            map(lambda x: x.contents[0].encode('ascii', 'ignore'),
                row_cols[18].find_all("a"))
        }

        for i in range(2, len(row_cols)):
            images = row_cols[i].find_all("img")
            if len(images) == 0:
                continue
            img = images[0]
            base64_value = img.attrs['src'].split(",")[1]
            process_base_64_file(
                "%s/%s/%s.png" %
                (work_location, get_company_name(i), unicode_name),
                base64_value)

    # Write Cross Company MetaData to file.
    with open("%s/MetaDataInfo.json" % work_location, "w") as f_meta:
        f_meta.write(json.dumps(meta_data_dictionary, sort_keys=True,
                                indent=4))

    # Convert all Images to RGB .jpg.
    for company in company_names():
        os.system("mogrify -flatten -format jpg %s/%s/*.png -quality 99" %
                  (work_location, company))
        os.system("rm %s/%s/*.png" % (work_location, company))
        os.system("mogrify -colorspace sRGB -type truecolor %s/%s/*.jpg" %
                  (work_location, company))
def get_hot_videos(Type):
    hot_videos = {}
    br = RoboBrowser(history=True, parser='lxml')
    for i in range(1, 4):
        url = 'http://91porn.com/v.php?category={}&viewtype=basic&page={}'.format(
            Type, i)
        br.open(url)

        # get every video's information
        videos = br.find_all('div', {'class': 'listchannel'})
        # get their titles and urls
        videos_dict = dict([(i.find('a').find('img')['title'],
                             i.find('a')['href']) for i in videos])
        hot_videos.update(videos_dict)
    return hot_videos
Example #32
 def ituv_info_covid(self, folder="Ituverava", path=None):
     # normalize the optional base path and create the output folder under it
     if path and path[-1] == "/":
         path = path[:-1]
     target_dir = f"{path}/{folder}" if path else folder
     if folder not in os.listdir(path):
         os.mkdir(target_dir)
     browser = RoboBrowser(parser="html.parser")
     url = "http://www.ituverava.sp.gov.br/"
     browser.open(url)
     banner = browser.find_all(class_="slidehomecropimg1 wp-post-image")[0]
     img = browser.session.get(banner["src"])
     filepath = f"{target_dir}/ituv_info_covid.png"
     with open(filepath, "wb") as f:
         f.write(img.content)
Example #33
def ExtractImageURL(url):

    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(url)

    center = browser.find_all('center')

    nested_center = center[1].find('center')

    Img_SRC = nested_center.find('img', src=True)

    print(Img_SRC['src'])

    return Img_SRC['src']
def main():
    args = docopt(__doc__, version="dailyprogrammer-dl v{}".format(__version__))

    # Configure logging
    logLevel = logging.INFO #default
    if args['--verbose']:
        logLevel = logging.DEBUG
    elif args['--quiet']:
        logLevel = logging.ERROR

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logLevel)
    logging.debug(args)

    # Process command line arguments
    challengeURL = args['<challengeurl>']

    # Parse project page for title and description
    logging.info("Parsing daily challenge: {}".format(challengeURL))
    browser = RoboBrowser()
    browser.session.headers['User-Agent'] = "dailyprogrammer-dl v{} by /u/zod77".format(__version__)
    browser.open(challengeURL)
    title = browser.find('a',class_='title').string
    description = browser.find_all('div',class_="md")
    description = description[1]
    descriptionHTML = "".join(str(t) for t in description.contents) # remove outer <div>

    projectName = generateProjectName(title)

    # Init project skeleton
    logging.info("Generating project")
    projectPath = os.path.abspath(projectName)
    os.mkdir(projectPath)

    # Write out project files
    pyTemplate = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"boilerplate.txt"))
    shutil.copy(pyTemplate, os.path.join(projectPath,"{}.py".format(projectName)))

    # Generate README.md
    h = html2text.HTML2Text()
    descriptionMD = h.handle(descriptionHTML)
    readme = os.path.join(projectPath,"README.md")
    with open(readme, "w") as f:
        f.write(descriptionMD)

    return
Example #35
File: tpb_scraper.py, Project: rbi13/piv
def getTorrents(search,limit=20):

	base = "http://thepiratebay.se/search/"
	opts = "/0/7/0"
	search = search.replace(' ','%20')
	
	url = base+search+opts

	browser = RoboBrowser()
	browser.open(url)

	rows = browser.find_all('tr')
	torrents = []
	for row in rows:
		title = row.find('a',{'class':'detLink'})
		if title:
			tor = {}
			tor.update( scrapeTitle(title) )
			tor.update( scrapeDetails(row) )
			torrents.append(tor)
			if len(torrents) >= limit:
				break

	return torrents
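
A usage sketch; scrapeTitle() and scrapeDetails() are assumed to be defined elsewhere in tpb_scraper.py, and the search term is a placeholder.

for torrent in getTorrents("ubuntu iso", limit=5):
    print(torrent)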
Example #36
File: tg.py, Project: bingsong/codesnippet
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from robobrowser import RoboBrowser
from requests import Session
from fake_useragent import UserAgent
import re

url = 'http://www.baidu.com'
ua = UserAgent()
keyword = 'sp68'

s = Session()
br = RoboBrowser(session=s, history=True, user_agent=ua.chrome)
br.parser = 'lxml'
br.timeout = 1
br.open(url)

form = br.get_form(action='/s')
form['wd'].value = keyword
br.submit_form(form)

print br.url
for link in br.find_all('a', href=re.compile("^http://www.baidu.com/baidu.php")):
    print link['href']

s.close()

def main():
    """This loops through every account in accounts.csv. Appending all their orders into 1 local html. 
	That html file uses css pulled from amazon.com so it looks the excat same, and all of the links work, except the ones that requre login.
	"""

    if not os.path.isfile("history.html"):
        makeHistoryFile()
    if not os.path.isfile("accounts.csv"):
        makeAccountFile()
        print "accounts.csv file made. Fill in email/passwords and run again."
        return 1

    with open("accounts.csv", "rU") as csvFile:
        reader = csv.reader(csvFile)

        for row in reader:
            email = str(row[0])
            password = str(row[1])
            update = str(row[2])

            if update.lower() == "true":
                # html5lib parser required for broken html on gameSplits
                s = requests.Session()
                s.headers[
                    "User-Agent"
                ] = "Mozilla (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7"
                browser = RoboBrowser(history=True, parser="html5lib", session=s)

                browser.open(
                    "https://www.amazon.com/ap/signin?_encoding=UTF8&openid.assoc_handle=usflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F%3Fref_%3Dnav_ya_signin"
                )

                form_signIn = browser.get_forms()[0]
                form_signIn["email"] = email
                form_signIn["password"] = password

                browser.submit_form(form_signIn)

                browser.open(
                    "https://www.amazon.com/gp/css/history/orders/view.html?orderFilter=year-%s&startAtIndex=1000"
                )

                orders = browser.find_all(class_="a-box-group a-spacing-base order")

                with open(r"./history.html", "a+") as historyFile:
                    historyFile.seek(0)
                    storedOrderIds = []
                    tempOrder = ""
                    storeLine = False
                    print "Collected orders from history.html"
                    for line in historyFile:
                        if line == "<!-- Start Order -->\n":
                            storeLine = True
                            continue
                        if line == "<!-- End Order -->\n":
                            storedOrderIds.append(getOrderId(cStringIO.StringIO(tempOrder)))
                            tempOrder = ""
                            storeLine = False
                        if storeLine:
                            tempOrder += line

                    print "Orders stored", len(storedOrderIds)
                    print "Find/Adding new orders for", email
                    for order in orders:
                        orderId = getOrderId(cStringIO.StringIO(order.__str__()))
                        if not orderId in storedOrderIds:
                            print "adding order", orderId
                            historyFile.write("\n<!-- Start Order -->\n")
                            historyFile.write(getAccountHtml(email))
                            historyFile.write(order.__str__())
                            historyFile.write("\n<!-- End Order -->\n")

    print "Done"
# Submit form
browser.session.headers['Referer'] = url
signin_form.serialize() 
browser.submit_form(signin_form)

url = 'https://bitbucket.org/dashboard/pullrequests?section=teams'
browser.open(url)
links = browser.select('tr.iterable-item')
for link in links:
	print "Repository: " + link.select('td.repo')[0].text.encode("utf-8").strip()
	print "User: "******"utf-8").strip()
	print "Title: " + link.select('td.title')[0].select('a.execute')[0].text.encode("utf-8").strip()
	print "Updated " + link.select('td.date')[0].text.encode("utf-8").strip()
	print "\n----------------------"
#obtain links with beautifulSoup
links = browser.find_all('a')
for link in links:
	try:
		#print(link.get('href'))
		if not link['href'].startswith("https"):
			link['href']='https://bitbucket.org'+link['href'].encode("utf-8").strip()
			#link['href']='/odigeoteam/frontend-html5'
		print link['href']
		#print link
		browser.follow_link(link)
	
		branches = browser.select('li.branches')
		if len(branches)>0 :
			print 'branches '+ branches[0].select('span.value')[0].text
	
		tags = browser.select('li.tags')
Example #39
#coding: utf-8
import re
from robobrowser import RoboBrowser

url = 'http://itest.info/courses/2'
b = RoboBrowser(history=True)
b.open(url)

# all <a> tags on the page
all_links = b.find_all('a')
for link in all_links:
  print link.text

# all <div> tags on the page with class "container"
divs = b.find_all(class_='container')
print divs

# the limit argument caps the number of elements returned

# the first two <p> tags on the page
first_two_p = b.find_all('p', limit=2)
print first_two_p

# if the first argument is a list, return tags matching any entry

# all <meta> and <img> tags on the page
print b.find_all(['meta', 'img'])



Example #40
form["username"] = args.username
form["password"] = args.password
browser.session.headers['Referer'] = args.course
browser.submit_form(form)
# Get course name (no special characters)
courseTitle = browser.find("title").text
courseTitle = remove_prefix(courseTitle, 'Course Modules: ')
courseTitle = "".join([x if x.isalnum() else "_" for x in courseTitle])
print('Course Url: ' + courseModulesUrl)
print('Course Title: ' + courseTitle)
print('Finding file links of type: ' + args.downloadOnly)
# Make output dir
outputDir = os.path.join('output/', courseTitle)
make_path(outputDir)
# Get modules links with lecture in title
moduleLinks = browser.find_all("a", { "class" : "for-nvda" })

print('Found ' + str(len(moduleLinks)) + ' links, (not all will be valid)')

# Process each lecture link
for moduleLink in moduleLinks:
    print('Opening: ' + moduleLink['aria-label'])
    browser.follow_link(moduleLink)
    try:
        # Find link - containing words "download"
        downloadLinkRel = browser.find('a', href = re.compile(r'.*download*'))
        # If failed, find link - containing reference to file "****.XXX"
        if downloadLinkRel is None: 
            downloadLinkRel = browser.find('a', href = re.compile(r'.*\.[a-z]{3,4}$'))
        fileNameWithExtension = downloadLinkRel.text.strip()
        # Check the link is the right filetype
Example #41
class ESPN_Scrape:
    def __init__(self):
        self.logged_in = False
        self.espn_header = {'1/0': 'H/AB'}
        self.br = RoboBrowser(history=True)

    def loginToESPN(self, leagueID, year):
        if not self.logged_in:
            link = 'http://games.espn.go.com/flb/leagueoffice?leagueId=' + str(leagueID) + '&seasonId=' + str(year)
            self.br = RoboBrowser(history=True)
            self.br.open(link)
            try:
                form = self.br.get_form(action="https://r.espn.go.com/espn/memberservices/pc/login")
                username = input('ESPN Username: \n')
                password = input('ESPN Password: \n')
                form['username'].value = username
                form['password'].value = password
                self.br.submit_form(form)
                self.logged_in = True
                print('\nLogging In\n')
            except:
                print('\nLogin Failed!\n')


    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def nameToBatPos(self, d):
        # BatPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field', 'Right Field', 'Designated Hitter']
        s = d.text.format('ascii')
        name = self.getPlayerName(s)
        s = s[s.find(',') + 2:]
        pID = self.getPlayerID(d)
        team = s[:s.find('\xa0')]
        pos = s[s.find('\xa0') + 1:]
        posOut = self.getBatPositions(pos)
        return [pID, name, team] + posOut

    def nameToPlayer(self, d):
        s = d.text.format('ascii')
        name = self.getPlayerName(s)
        s = s[s.find(',') + 2:]
        pID = self.getPlayerID(d)
        team = self.getPlayerTeam(s)
        return [pID, name, team]

    def getPlayerName(self, s):
        return s[:s.find(',')]


    def getPlayerID(self, d):
        return d.find_all('a')[0]['playerid']

    def getPlayerTeam(self, s):
        return s[:s.find('\xa0')]


    def getBatPositions(self, s):
        posOut = [None] * 9
        if 'SSPD' in s:
            s = s.replace('SSPD', '')
        if '1B' in s:
            posOut[1] = 1
            s = s.replace('1B', '')
        if '2B' in s:
            posOut[2] = 1
            s = s.replace('2B', '')
        if '3B' in s:
            posOut[3] = 1
            s = s.replace('3B', '')
        if 'SS' in s:
            posOut[4] = 1
            s = s.replace('SS', '')
        if 'LF' in s:
            posOut[5] = 1
            s = s.replace('LF', '')
        if 'CF' in s:
            posOut[6] = 1
            s = s.replace('CF', '')
        if 'RF' in s:
            posOut[7] = 1
            s = s.replace('RF', '')
        if 'DH' in s:
            posOut[8] = 1
            s = s.replace('DH', '')
        if 'C' in s:
            posOut[0] = 1
            s = s.replace('C', '')
        return posOut

    def splitHAB(self, s):
        hits = s[:s.find('/')]
        ab = s[s.find('/') + 1:]
        if self.is_number(hits):
            hits = float(hits)
        else:
            hits = 0
        if self.is_number(ab):
            ab = float(ab)
        else:
            ab = 0
        return [hits, ab]

    def nameToPitchPos(self, d):
        # ['Starting Pitcher', 'Relief Pitcher']
        s = d.text.format('ascii')
        name = s[:s.find(',')]
        s = str(s[s.find(',') + 2:])
        pID = d.find_all('a')[0]['playerid']
        team = s[:s.find('\xa0')]
        pos = s[s.find('\xa0') + 1:]
        posOut = self.getPitchPositions(pos)
        return [pID, name, team] + posOut

    def getPitchPositions(self, s):
        posOut = [None] * 2
        if 'SSPD' in s:
            s = s.replace('SSPD', '')
        if 'SP' in s:
            posOut[0] = 1
            s = s.replace('SP', '')
        if 'RP' in s:
            posOut[1] = 1
            s = s.replace('RP', '')
        return posOut

    def tableToBatters(self, table):
        Hitters = pd.DataFrame()
        rows = table.find_all('tr')
        rows = rows[2:]
        for r in rows:
            data = r.find_all('td')
            data = [data[0]] + data[8:20]
            row_data = []
            for i, d in enumerate(data):
                if i == 0:
                    row_data = self.nameToBatPos(d)
                elif '/' in d.text:
                    row_data += self.splitHAB(d.text)
                else:
                    if self.is_number(d.text):
                        row_data.append(float(d.text))
                    else:
                        row_data.append(0)
            Hitters = Hitters.append(pd.Series(row_data), ignore_index=True)
        return Hitters

    def tableToPitchers(self, table):
        Pitchers = pd.DataFrame()
        rows = table.find_all('tr')
        rows = rows[2:]
        for r in rows:
            data = r.find_all('td')
            data = [data[0]] + data[8:24]
            row_data = []
            for i, d in enumerate(data):
                if i == 0:
                    row_data = self.nameToPitchPos(d)
                else:
                    if self.is_number(d.text):
                        row_data.append(float(d.text))
                    else:
                        row_data.append(0)
            Pitchers = Pitchers.append(pd.Series(row_data), ignore_index=True)
        return Pitchers


    def scrapePlayerProjections(self, leagueID, year):
        self.loginToESPN(leagueID, year)
        Hitters = pd.DataFrame()
        HitPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field',
                  'Right Field', 'Designated Hitter']
        Pitchers = pd.DataFrame()
        PitchPos = ['Starting Pitcher', 'Relief Pitcher']
        thead = []
        index = 0
        # get batter values
        self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
            year) + '&context=freeagency&view=stats&version=projections&startIndex=0&avail=-1&startIndex=' + str(index))
        table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
        rows = table.find_all('tr')

        # get the column headers
        header = rows[1]
        data = header.find_all('td')
        data = [data[0]] + data[8:20]
        for d in data:
            txt = d.text.replace('\xa0', '')
            thead.append(txt.format('ascii'))
        thead[0] = 'PlayerId'
        if 'H/AB' in thead:
            ind = thead.index('H/AB')
            thead[ind] = 'AB'  # AB stored in ind+1
            thead.insert(ind, 'H')  # H stored in ind
        thead.insert(1, 'Team')
        thead.insert(1, 'Name')
        thead = thead[0:3] + HitPos + thead[3:]
        # get player projections
        while index < 250:
            self.br.open(
                'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
                    year) + '&context=freeagency&view=stats&version=projections&avail=-1&startIndex=' + str(index))
            table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
            Hitters = Hitters.append(self.tableToBatters(table), ignore_index=True)
            index += 50
        Hitters.columns = thead
        index = 0


        # get Pitchers
        self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
            year) + '&context=freeagency&view=stats&version=projections&avail=-1&slotCategoryGroup=2&startIndex=' + str(
            index))
        table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
        rows = table.find_all('tr')

        # get the column headers
        thead = []
        header = rows[1]
        data = header.find_all('td')
        data = [data[0]] + data[8:24]
        for d in data:
            txt = d.text.replace('\xa0', '')
            thead.append(txt.format('ascii'))
        thead[0] = 'PlayerId'
        thead.insert(1, 'Team')
        thead.insert(1, 'Name')
        thead = thead[0:3] + PitchPos + thead[3:]
        #get player projections
        while index < 250:
            self.br.open(
                'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
                    year) + '&context=freeagency&view=stats&version=projections&avail=-1&slotCategoryGroup=2&startIndex=' + str(
                    index))
            table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
            Pitchers = Pitchers.append(self.tableToPitchers(table), ignore_index=True)
            index += 50
        Pitchers.columns = thead

        return Hitters, Pitchers


    def scrapePlayerSeason(self, leagueID, year):
        self.loginToESPN(leagueID, year)
        Hitters = pd.DataFrame()
        HitPos = ['Catcher', 'First Base', 'Second Base', 'Third Base', 'Shortstop', 'Left Field', 'Center Field',
                  'Right Field', 'Designated Hitter']
        Pitchers = pd.DataFrame()
        PitchPos = ['Starting Pitcher', 'Relief Pitcher']
        thead = []
        index = 0
        # get batter values
        self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
            year) + '&context=freeagency&view=stats&version=currSeason&startIndex=0&avail=-1&startIndex=' + str(index))
        table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
        rows = table.find_all('tr')

        # get the column headers
        header = rows[1]
        data = header.find_all('td')
        data = [data[0]] + data[8:20]
        for d in data:
            txt = d.text.replace('\xa0', '')
            thead.append(txt.format('ascii'))
        thead[0] = 'PlayerId'
        if 'H/AB' in thead:
            ind = thead.index('H/AB')
            thead[ind] = 'AB'  # AB stored in ind+1
            thead.insert(ind, 'H')  # H stored in ind
        thead.insert(1, 'Team')
        thead.insert(1, 'Name')
        thead = thead[0:3] + HitPos + thead[3:]
        # get player projections
        while index < 250:
            self.br.open(
                'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
                    year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&startIndex=' + str(index))
            table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
            Hitters = Hitters.append(self.tableToBatters(table), ignore_index=True)
            index += 50
        Hitters.columns = thead
        index = 0


        # get Pitchers
        self.br.open('http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
            year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&slotCategoryGroup=2&startIndex=' + str(
            index))
        table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
        rows = table.find_all('tr')

        # get the column headers
        thead = []
        header = rows[1]
        data = header.find_all('td')
        data = [data[0]] + data[8:24]
        for d in data:
            txt = d.text.replace('\xa0', '')
            thead.append(txt.format('ascii'))
        thead[0] = 'PlayerId'
        thead.insert(1, 'Team')
        thead.insert(1, 'Name')
        thead = thead[0:3] + PitchPos + thead[3:]
        # get player projections
        while index < 250:
            self.br.open(
                'http://games.espn.go.com/flb/freeagency?leagueId=' + str(leagueID) + '&teamId=1&seasonId=' + str(
                    year) + '&context=freeagency&view=stats&version=currSeason&avail=-1&slotCategoryGroup=2&startIndex=' + str(
                    index))
            table = self.br.find_all('table', class_='playerTableTable tableBody')[0]
            Pitchers = Pitchers.append(self.tableToPitchers(table), ignore_index=True)
            index += 50
        Pitchers.columns = thead

        return Hitters, Pitchers


    def scrapeTeamPlayers(self, leagueID, year, teams):
        self.loginToESPN(leagueID, year)

        teamBatters = pd.DataFrame()
        teamPitchers = pd.DataFrame()

        urls = list(teams['Link'])
        for u in urls:
            self.br.open('http://games.espn.go.com' + u)
            teamId = teams[teams['Link'] == u].iloc[0]['teamId']
            # batters
            Btable = self.br.find_all('table', class_='playerTableTable tableBody')[0]
            rows = Btable.find_all('tr')
            rows = rows[2:]
            for r in rows:
                d = r.find_all('td')[1]
                if d.find_all('a'):
                    pID = int(self.getPlayerID(d))
                    teamBatters = teamBatters.append(pd.Series([teamId, pID]), ignore_index=True)



            # pitchers
            Ptable = self.br.find_all('table', class_="playerTableTable tableBody playerTableMoreTable")[0]
            rows = Ptable.find_all('tr')
            rows = rows[2:]
            for r in rows:
                d = r.find_all('td')[1]
                if d.find_all('a'):
                    pID = int(self.getPlayerID(d))
                    teamPitchers = teamPitchers.append(pd.Series([teamId, pID]), ignore_index=True)

        teamBatters.columns = ['teamId', 'playerId']
        teamPitchers.columns = ['teamId', 'playerId']
        return teamBatters, teamPitchers


    # data frame containing all of the results for each week's matchups
    # [weekID, gameID, teamID, H, R, 2B, 3B, HR, XBH, RBI, BB, SB, AVG, OBP, SLG,
    # K, QS, CG, SO, W, L, SV, HD, BAA, ERA, WHIP, K/9, Wins, Losses, Ties, H/A]

    def scrapeMatchupResults(self, leagueId, year):
        matchups = pd.DataFrame()
        week = self.currentWeek()
        weeks = [i for i in range(1, week + 1)]
        for w in weeks:
            matchups = matchups.append(self.scrapeMatchUpWeek(leagueId, year, w), ignore_index=True)
        return matchups


    # data frame containing all of the results for one week's matchups
    # [weekID, gameID, teamID, H, R, 2B, 3B, HR, XBH, RBI, BB, SB, AVG, OBP, SLG,
    # K, QS, CG, SO, W, L, SV, HD, BAA, ERA, WHIP, K/9, Wins, Losses, Ties, H/A]
    def scrapeMatchUpWeek(self, leagueId, year, weekId):
        matchupWeek = pd.DataFrame()
        self.loginToESPN(leagueId, year)
        link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str(
            year) + '&matchupPeriodId=' + str(weekId)
        self.br.open(link)
        table = self.br.find_all('table', class_='tableBody')
        table = table[0]
        rows = table.find_all('tr')
        head = rows[1].find_all('th')
        header = [h.text for h in head]
        while '' in header:
            header.remove('')
        header = header[1:-1]
        header.insert(0, 'Name')
        header.insert(0, 'teamId')
        header.insert(0, 'gameId')
        header.insert(0, 'weekId')
        header.append('Wins')
        header.append('Losses')
        header.append('Ties')
        header.append('H/A')
        stats = rows[2:]
        count = 0
        for r in stats:
            data_row = []
            teamRow = r.find_all('td', class_='teamName')
            if teamRow:
                name = self.teamNameToRow(teamRow[0])
                data = r.find_all('td')
                for d in data:
                    if self.is_number(d.text):
                        data_row.append(float(d.text))
                score = self.scoreToList(data[-1].text)
                out = [weekId, 6 * (weekId - 1) + math.floor(count / 2)] + name[:2] + data_row + score + [count % 2]
                matchupWeek = matchupWeek.append(pd.Series(out), ignore_index=True)
                count += 1
        matchupWeek.columns = header
        return matchupWeek


    def scoreToList(self, s):
        wins = float(s[:s.find('-')])
        s = s[s.find('-') + 1:]
        losses = float(s[:s.find('-')])
        ties = float(s[s.find('-') + 1:])
        return [wins, losses, ties]
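    # Quick worked example (assumes ESPN renders a weekly record as 'W-L-T'):
    #   scoreToList('8-5-2') -> [8.0, 5.0, 2.0]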


    # takes current date and find the current week
    def currentWeek(self):
        weekIds = pd.read_csv('Data/weekId.csv', index_col=0)
        now = datetime.datetime.now()
        weekEnds = list(weekIds['end'])
        for i, w in enumerate(weekEnds):
            dt = datetime.datetime.strptime(w, '%m/%d/%y')
            if dt > now:
                return i + 1
        return i + 1
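    # A minimal sketch of the Data/weekId.csv layout this method assumes (only the
    # 'end' column is read; the 'start' column and dates below are made-up placeholders):
    #   ,start,end
    #   1,4/4/16,4/10/16
    #   2,4/11/16,4/17/16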


    # data frame containing all of the matchups
    # [weekID, gameID, teamID, H/A]
    def scrapeLeagueSchedule(self, leagueId, year):
        schedule = pd.DataFrame()
        self.loginToESPN(leagueId, year)
        weekId = 0
        gameId = 0
        while weekId < 22:
            link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str(
                year) + '&matchupPeriodId=' + str(weekId)
            self.br.open(link)
            table = self.br.find_all('table', class_='tableBody')
            table = table[0]
            rows = table.find_all('tr')
            count = 0
            for r in rows:
                data = r.find_all('td', class_='teamName')
                for d in data:
                    name_row = self.teamNameToRow(d)
                    homeAway = count % 2
                    schedule = schedule.append(pd.Series([weekId, gameId, name_row[0], homeAway]), ignore_index=True)
                    count += 1
                    if count % 2 == 0:
                        gameId += 1
            weekId += 1
        schedule.columns = ['weekId', 'gameId', 'teamId', 'H/A']
        return schedule


    # return all matchups so far
    def scrapeMatchupPlayers(self, leagueId, year):
        batters = pd.DataFrame()
        pitchers = pd.DataFrame()
        week = self.currentWeek() - 1

        weeks = [i for i in range(1, week + 1)]
        for w in weeks:
            B, P = self.scrapeMatchupPlayersWeek(leagueId, year, w)
            batters = batters.append(B, ignore_index=True)
            pitchers = pitchers.append(P, ignore_index=True)
        return batters, pitchers

    # data frame containing player results for each matchup
    # both hitters and pitchers and their categories
    def scrapeMatchupPlayersWeek(self, leagueId, year, week):
        matchupBatters = pd.DataFrame()
        matchupPitchers = pd.DataFrame()
        link = 'http://games.espn.go.com/flb/scoreboard?leagueId=' + str(leagueId) + '&seasonId=' + str(
            year) + '&matchupPeriodId=' + str(week)

        base = 'http://games.espn.go.com'
        self.loginToESPN(leagueId, year)
        self.br.open(link)
        links = self.br.find_all('a')
        bscores = []
        for l in links:
            if l.text == 'Full Box Score':
                bscores.append(base + l['href'])
        for bs in bscores:
            self.br.open(bs)
            tables = self.br.find_all('table', class_="playerTableTable tableBody")
            for i, t in enumerate(tables):
                if i % 2:  # Pitchers
                    matchupPitchers = matchupPitchers.append(self.scrapeMatchupPitchers(t), ignore_index=True)

                else:  # Batters

                    matchupBatters = matchupBatters.append(self.scrapeMatchupBatters(t), ignore_index=True)

        matchupBatters['weekId'] = week
        matchupPitchers['weekId'] = week
        return matchupBatters, matchupPitchers

    def scrapeMatchupBatters(self, table):
        batters = pd.DataFrame()
        rows = table.find_all('tr')
        head = rows[2].find_all('td')
        header = [h.text for h in head]
        header = header[2:]
        header[0] = 'PlayerId'
        if 'H/AB' in header:
            ind = header.index('H/AB')
            header[ind] = 'AB'  # AB stored in ind+1
            header.insert(ind, 'H')  # H stored in ind
        header.insert(1, 'Team')
        header.insert(1, 'Name')
        rows = rows[3:-1]
        for r in rows:
            data_row = r.find_all('td')
            data_row = [data_row[0]] + data_row[3:]
            row_data = []
            for i, d in enumerate(data_row):
                if i == 0:
                    row_data = self.nameToPlayer(d)
                elif '/' in d.text:
                    row_data += self.splitHAB(d.text)
                else:
                    if self.is_number(d.text):
                        row_data.append(float(d.text))
                    else:
                        row_data.append(0)
            batters = batters.append(pd.Series(row_data), ignore_index=True)
        batters.columns = header
        return batters

    def scrapeMatchupPitchers(self, table):
        pitchers = pd.DataFrame()
        rows = table.find_all('tr')
        head = rows[1].find_all('td')
        header = [h.text for h in head]
        header = header[2:]
        header[0] = 'PlayerId'
        header.insert(1, 'Team')
        header.insert(1, 'Name')
        rows = rows[3:-1]
        for r in rows:
            data_row = r.find_all('td')
            data_row = [data_row[0]] + data_row[3:]
            row_data = []
            for i, d in enumerate(data_row):
                if i == 0:
                    row_data = self.nameToPlayer(d)
                else:
                    if d.text == 'INF':
                        row_data.append(13.5)  # ESPN shows 'INF' for undefined ratio stats (e.g. ERA with 0 IP); use 13.5 as a stand-in
                    elif self.is_number(d.text):
                        row_data.append(float(d.text))
                    else:
                        row_data.append(0)
            pitchers = pitchers.append(pd.Series(row_data), ignore_index=True)
        pitchers.columns = header
        return pitchers

    # returns data frame containing
    # [teamID, teamName, shortName, wins, losses, draws]
    def scrapeLeagueTeams(self, leagueId, year):
        self.loginToESPN(leagueId, year)

        # dataframe will have the following columns:
        # [teamID, teamName, wins, losses, draws]
        teams = pd.DataFrame()

        self.br.open('http://games.espn.go.com/flb/standings?leagueId=' + str(leagueId) + '&seasonId=' + str(year))
        tables = self.br.find_all('table', class_='tableBody')
        tables = tables[:-1]
        for t in tables:
            row = t.find_all('tr')[2:]
            for r in row:
                data = r.find_all('td')
                name = data[0]
                name_row = self.teamNameToRow(name)
                wins = float(data[1].text)
                losses = float(data[2].text)
                draw = float(data[3].text)
                out = name_row + [wins, losses, draw]
                teams = teams.append(pd.Series(out), ignore_index=True)
        teams.columns = ['teamId', 'Name', 'Link', 'W', 'L', 'T']
        return teams


    def teamNameToRow(self, name):
        link = name.find_all('a')[0]['href']
        ID = link.split('&')[1]
        teamID = int(ID[ID.find('=') + 1:])
        teamName = name.text
        if teamName.find(' (') != -1:
            teamName = teamName[:teamName.find(' (')]

        return [teamID, teamName, link]
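    # Illustrative example (hypothetical clubhouse href of the form
    # '/flb/clubhouse?leagueId=123&teamId=4&seasonId=2016'):
    #   teamNameToRow(cell) -> [4, 'Some Team', '/flb/clubhouse?leagueId=123&teamId=4&seasonId=2016']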


    def scrapeTeamStats(self, leagueID, year):
        self.loginToESPN(leagueID, year)

        # dataframe will have the following columns:
        # [teamID, teamName, wins, losses, draws]
        teamStats = pd.DataFrame()

        self.br.open('http://games.espn.go.com/flb/standings?leagueId=' + str(leagueID) + '&seasonId=' + str(year))
        tables = self.br.find_all('table', class_='tableBody')
        table = tables[-1]
        rows = table.find_all('tr')
        head = rows[2].find_all('td')
        header = [h.text for h in head]
        while '' in header:
            header.remove('')
        header.insert(0, 'Name')
        header.insert(0, 'teamId')
        stats = rows[3:]

        for r in stats:
            data_row = []
            data = r.find_all('td')
            name = self.teamNameToRow(data[1])
            data = data[2:-2]
            for d in data:
                if self.is_number(d.text):
                    data_row.append(float(d.text))
            out = name[:2] + data_row

            teamStats = teamStats.append(pd.Series(out), ignore_index=True)
        teamStats.columns = header
        return teamStats
Example #42
challenge_count = 1

while True:
    browser = RoboBrowser(parser='lxml')
    browser.open(SITE_URL)

    # loop forever
    #try catch this
    signin_form = browser.get_forms()[0]
    signin_form['login'].value = username
    signin_form['password'].value = password
    browser.submit_form(signin_form)

    #get the leaderboard list
    browser.follow_link(browser.get_link(text='Leaderboard'))
    bot_name_tags = browser.find_all('div', {'class': 'bot-name'})
    bot_name_extracter = lambda tag: tag.string.replace('\t', '').replace('\n', '').lower()
    bot_names = list(map(bot_name_extracter, bot_name_tags))
    no_bots = len(bot_names)

    our_rank = bot_names.index('cbteamname') + 1
    print("[INFO] CBTeamName is ranked " + str(our_rank))

    random.seed(os.urandom(8))
    opponent_queue = []
    #three bots with lower rank
    opponent_queue += ([bot_names[random.randint(our_rank + 1, no_bots - 1)],
                        bot_names[random.randint(our_rank + 1, no_bots - 1)],
                        bot_names[random.randint(our_rank + 1, no_bots - 1)]])
    #one bot with a higher rank
    opponent_queue += ([bot_names[random.randint(0, our_rank - 1)]])
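    # (Assumes the leaderboard divs are listed best-to-worst, so indices larger than
    #  our own index pick lower-ranked bots and smaller indices pick higher-ranked ones.)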
Example #43
     br.open(url)
     fp = br.parsed
     #f0 = open('f1.html', 'w')
     #f0.write(str(fp))
     
     #login
     form=br.get_form(id='mod_loginform')
     form['username'].value= 'pygather'
     form['passwd'].value= '1324354657687980'
     br.submit_form(form)
     sp = br.parsed
     #f2 = open('f2.html','w')
     #f2.write(str(sp))
 
     #navigate to quick submit
     for a in br.find_all('a', href=True, text = re.compile('Quick Submit')):
         br.follow_link(a)
     tp = br.parsed
 
 
 
     form = br.get_form(action = re.compile('Itemid=25'))
     # print(form)
     #form.new_control('text','code',{'value':''})
     #form.fixup()
     form['localid'].value=str(curProgram)
     form['language'].value='2'
     form['code'].value='import java.util.*;class Main{public static void main(String[]args) throws Exception{Scanner in = new Scanner(System.in);StringBuilder sb = new StringBuilder();while(in.hasNextLine()){sb.append(in.nextLine());}byte b=(byte)sb.charAt('+str(curByte)+');if((b>>'+str(shift)+'&0x01)==0){throw new Exception("Error");}}}'
     br.submit_form(form)
     #f3 = open('f3.html','w')
     #f3.write(str(tp))
Example #44
File: c.py  Project: L3Sota/atcoder
# pip3 install robobrowser
from robobrowser import RoboBrowser

browser = RoboBrowser()
browser.open('https://cdn.hackerrank.com/hackerrank/static/contests/capture-the-flag/infinite/qds.html')
history = ['https://cdn.hackerrank.com/hackerrank/static/contests/capture-the-flag/infinite/qds.html']
font = browser.find_all('font')
links = browser.get_links()

while len(font) < 50000 and len(links) > 0:
    link = links.pop(0)
    if link not in history:
        browser.follow_link(link)
        font.extend(browser.find_all('font'))
        links.extend(browser.get_links())
        history.append(link)

print(font)
Example #45
path = os.path.join(path, '暴走GIF')
if not os.path.exists(path):
    os.mkdir(path)                                  # create the folder

url = "http://baozoumanhua.com/gif/month/page/"     # base URL
headers = {                                         # spoof a browser user agent
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                 ' Chrome/32.0.1700.76 Safari/537.36'
}


browser = RoboBrowser(history=True,user_agent='Mozilla/5.0 ... Safari/537.36')

for count in range(page_sum):
    try:
        browser.open(url+str(count+1),method='get',headers=headers)
    except Exception:
        continue

    if browser.response.status_code != 200:
        continue
    else:
        img_content = browser.find_all('img', attrs={'style': 'width:460px'})
        url_list = [img['src'] for img in img_content]      # image URLs via list comprehension
        title_list = [img['alt'] for img in img_content]    # image titles
        print("count:" + str(count))
        for i in range(len(url_list)):
            imgurl = url_list[i]
            filename = path.decode('utf-8') + os.sep.decode('utf-8') + title_list[i] + ".gif"
            print(filename + ":" + imgurl)                   # print download info
            urllib.urlretrieve(imgurl, filename)             # download the image
Example #46
# Browser
#br = mechanize.Browser()
br = RoboBrowser(history=True, user_agent='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2')


# The site we will navigate into, handling it's session
br.open('http://heroes-wow.com/wotlk/index.php?page=login')

login_form = br.get_form(action="http://heroes-wow.com/wotlk/execute.php?take=login")
login_form['username'].value = 'anathk2'
login_form['password'].value = 'wow123456'
login_form['rememberme'].value = '1'

br.submit_form(login_form)


br.open('http://topg.org/server-heroes-wow-id347987')
links = br.find_all('a', href=True)
br.follow_link(links[22])
result = br.parsed

new_links = br.find_all('a', href=True)
br.follow_link(new_links[1])






Example #47
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detial links, which is where must of the data is scraped.
    Scrapes weather, and stadium enough per week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace('  ', ' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace('  ', ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())
                
                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index-1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                #find nfl_schedule, update gameTime, hopefully result as id, insert id into both info dicts, append to _list
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_scedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
Example #48
def spider(update=False, daysago=30, name=None, path=_DIR_WISEREP, include_type=[], force=False):
    start_time = time.time()

    incl_type_str = 'supernovae' if not include_type else '-'.join(
        include_type)

    if not os.path.exists(_PATH + path):
        os.mkdir(_PATH + path)

    # dig up lists of known non-supernovae and completed events, or create if
    # it does not exist
    if os.path.exists(_PATH + path + 'lists.json'):
        with open(_PATH + path + 'lists.json', 'r') as json_in:
            list_dict = json.load(json_in)
    else:
        list_dict = {'non_SN': [], 'completed': []}

        with open(_PATH + path + 'lists.json', 'w') as fp:
            json.dump(list_dict, fp, indent=4)

    # collect metadata for the few available host spectra and
    # build a dictionary that will be used below to
    # remove by SNname and "Spectrum Type"
    obj_host_dict = {}
    if daysago:
        browser = RoboBrowser(history=False, parser='lxml')
        browser.open(_WISEREP_SPECTRA_URL)
        form = browser.get_form(action='/spectra/list')
        form['spectypeid'] = "2"  # 2 for Host spectrum
        form['rowslimit'] = "10000"
        browser.submit_form(form)
        print('\tHost page received')

        obj_host_headers = (browser.find("tr", {"style": "font-weight:bold"})
                            .findChildren("td"))

        for i, header in enumerate(obj_host_headers):
            # if header.text == 'Obj. Name':
            #     host_obj_name_idx = i
            if header.text == 'Spec.Program':
                host_program_idx = i
            if header.text == 'Instrument':
                host_instrument_idx = i
            if header.text == 'Observer':
                host_observer_idx = i
            if header.text == 'Obs. Date':
                host_obsdate_idx = i
            if header.text == 'Reducer':
                host_reducer_idx = i
            if header.text == 'Ascii FileFits  File':
                host_filename_idx = i

        obj_host_list = browser.find_all(
            "a", {"title": "Click to show/update object"})

        for i, obj in enumerate(obj_host_list):
            print('\tParsing', i + 1, 'of', len(obj_host_list), 'host spectra')
            obj_name = obj.text
            host_children = obj.parent.parent.findChildren("td")
            host_program = host_children[host_program_idx].text
            host_instrument = host_children[host_instrument_idx].text
            host_observer = host_children[host_observer_idx].text
            host_obsdate = host_children[host_obsdate_idx].text
            host_reducer = host_children[host_reducer_idx].text
            host_filename = host_children[host_filename_idx].text
            host_filename = host_filename.strip().split('\n')[0]

            obj_host_dict[obj_name] = OrderedDict([
                ("Type", "Host spectrum"),
                ("Filename", host_filename),
                ("Obs. Date", host_obsdate),
                ("Program", host_program),
                ("Instrument", host_instrument),
                ("Observer", host_observer),
                ("Reducer", host_reducer),
            ])

    # begin scraping WISeREP OBJECTS page for supernovae
    browser = RoboBrowser(history=False, parser='lxml')
    browser.open(_WISEREP_OBJECTS_URL)
    form = browser.get_form(action='/objects/list')

    # ready search form with field entries to submit, depending on --update
    if browser and update:
        if daysago:
            daysstr = str(daysago)
            # set "Added within the last args.daysago days"
            print('Collecting new spectra from the last', daysstr, 'days')

            form['daysago'] = daysstr
        if name:
            form['name'] = name
        form['rowslimit'] = "10000"
        browser.submit_form(form)

        try:
            new_objs = (browser.find("tr", {"style": "font-weight:bold"})
                        .parent.findChildren("tr", {"valign": "top"}))
        except AttributeError:
            if daysago:
                print('Nothing to collect since ' + daysstr + ' days ago')
            else:
                print('Nothing to collect!')
            return

        new_objs = (browser.find("tr", {"style": "font-weight:bold"})
                    .parent.findChildren("tr", {"valign": "top"}))

        SN_list_tags = []

        for obj in new_objs:
            obj_name_tag = obj.find("a", {"title": "Click to show/update"})
            SN_list_tags.append(obj_name_tag)

        SN_list_tags = [i for i in SN_list_tags if i is not None]

    elif browser and not update:
        # grab object name list, and remove `Select Option' from list [1:]
        print('Grabbing list of events from WISeREP')
        SN_list_tags = browser.find("select",
                                    {"name": "objid"}).find_all("option")[1:]

    # Begin by selecting event, visiting page, and scraping.
    # SN_list = ['SN2009ip']
    # for item in SN_list:
    for item in SN_list_tags:
        SNname = item.get_text()
        # SNname = item

        if not force and SNname in list_dict['non_SN']:
            print(SNname, 'is not a ' + incl_type_str + ' -- Skipping')
            continue
        elif SNname in list_dict['completed']:
            print(SNname, 'already done')
            continue

        print('Searching for', SNname, '...')

        # reset for every event -- change if needed
        SN_dict = {}

        # if in update mode and SNname directory exists, remove it
        if update:
            rmSNdir(SNname, path)

        # set Obj Name to SNname and retrieve results page
        form['name'] = SNname
        browser.submit_form(form)
        print('\tPage received')

        # locate object header indices (_idx)
        try:
            headers = browser.find(
                "tr", {"style": "font-weight:bold"}).findChildren("td")
        except AttributeError:
            if update:
                updateListsJson(SNname, list_dict['completed'], list_dict,
                                path)
                print('\t', 'No spectra to collect')
                break
            else:
                updateListsJson(SNname, list_dict['completed'], list_dict,
                                path)
                print('\t', SNname, 'has no available spectra')
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('From statement 1: ' + SNname +
                            ' has no spectra to collect' + '\n')
            continue

        for i, header in enumerate(headers):
            if header.text == 'Obj. Name':
                obj_name_idx = i
            if header.text == 'IAUName':
                iau_name_idx = i
            if header.text == 'Redshift':
                redshift_idx = i
            if header.text == 'Type':
                type_idx = i
            if header.text == 'No. of publicSpectra':  # publicSpectra not a typo
                num_total_spec_idx = i

        # locate objects returned -- it's not always one
        obj_list = browser.find_all("form", {"target": "new"})
        num_objs = len(obj_list)

        if num_objs >= 1 and update:
            print('\tNew data available for', num_objs, 'objects.')
        if num_objs != 1:
            with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                f.write(
                    str(num_objs) + ' objects returned for ' + SNname + '\n')

        # locate darkred text ``Potential matching IAU-Name'' if it exists
        # the location of html table rows (tr) changes if it exists
        darkred = browser.find(
            "span",
            text=" Potential matching IAU-Name/s:",
            attrs={"style": "color:darkred; font-size:small"})

        # parse obj_list, match to SNname, and find its spectra
        target = ''
        for obj in obj_list:
            obj_header = obj.parent.findChildren("td")
            obj_name = obj_header[obj_name_idx].text

            if SNname == obj_name:
                target = obj_header

                # this checks for spurious page element that changes layout
                if darkred:
                    try:
                        target_spectra = (
                            obj.parent.nextSibling.nextSibling.findChildren(
                                "tr", {"valign": "top"}))
                    except AttributeError:
                        print('\t', SNname, 'has no spectra to collect')
                        with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                            f.write('From statement 2: ' + SNname +
                                    ' has no spectra to collect' + '\n')
                        continue

                elif darkred is None:
                    try:
                        target_spectra = obj.parent.nextSibling.findChildren(
                            "tr", {"valign": "top"})
                    except AttributeError:
                        print('\t', SNname, 'has no spectra to collect')
                        with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                            f.write('From statement 3: ' + SNname +
                                    ' has no spectra to collect' + '\n')
                        continue
        # No match found, skip this event
        if not target:
            continue

        # exclude non-SN
        SNtype = target[type_idx].text
        if not force:
            if ((include_type and SNtype not in include_type) or
                    (not include_type and SNtype in exclude_type)):
                updateListsJson(SNname, list_dict['non_SN'], list_dict, path)
                updateListsJson(SNname, list_dict['completed'], list_dict, path)
                print('\t', SNname, 'is a', SNtype)
                with open(_PATH + path + 'non-' + incl_type_str + '.txt', 'a') as f:
                    f.write(SNname + ' is a ' + SNtype + '\n')
                continue

            elif SNtype == '':
                # SNtype = 'Unspecified by WISeREP'
                print('\tType not specified by WISeREP.',
                      'Check the Open Supernova Catalog for type.')
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('Type not specified by WISeREP.' +
                            'Check the Open Supernova Catalog for type.')

        # create a directory even if the SN event has no spectra.
        # find other instances of mkSNdir to revert this.
        mkSNdir(SNname, path)

        # second chance to exclude events without spectra
        num_total_spec = target[num_total_spec_idx].text
        num_total_spec = unicodedata.normalize("NFKD", num_total_spec)
        if num_total_spec == u'  ' or num_total_spec == u' 0 ':
            updateListsJson(SNname, list_dict['completed'], list_dict, path)
            print('\t', SNname, 'has no spectra to collect')
            with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                f.write('From statement 4: ' + SNname +
                        ' has no spectra to collect' + '\n')
            continue

        redshift = target[redshift_idx].text

        SN_dict[SNname] = OrderedDict()

        # number of publicly available spectra
        num_pub_spectra = 0

        spec_header = browser.find(
            "tr",
            {"style": "color:black; font-size:x-small"}).findChildren("td")
        for i, header in enumerate(spec_header):
            if header.text == 'Spec. Prog.':
                program_idx = i
            if header.text == 'Instrument':
                instrument_idx = i
            if header.text == 'Observer':
                observer_idx = i
            if header.text == 'Obs.date':
                obsdate_idx = i
            if header.text == 'Reducer':
                reducer_idx = i
            if header.text == 'Ascii/Fits Files':
                filename_idx = i
            if header.text == 'Publish':
                publish_idx = i
            if header.text == 'Contrib':
                contrib_idx = i
            if header.text == 'Last-modified':
                last_mod_idx = i
            if header.text == 'Modified-by':
                modified_by_idx = i

        # build SN_dict and locate ascii files on search results page
        # associated with SNname
        spectrum_haul = OrderedDict()

        for spec in target_spectra:

            spec_link = spec.find("a", href=re.compile(_ASCII_URL))
            try:
                dat_url = quote(spec_link.attrs['href'], "http://")
            except AttributeError:  # handles a return of 'None'
                continue
            children = spec.findChildren("td")
            filename = spec_link.text
            program = children[program_idx].text
            if not force and program in exclude_program:
                print('\tSkipping', program, 'spectrum')
                # but still count it as public
                num_pub_spectra += 1
                continue

            # list of duplicate file prefixes to be excluded
            # list not shortened to ['t', 'f', 'PHASE'] for sanity
            regexes = [
                't' + SNname, 'tPSN', 'tPS', 'tLSQ', 'tGaia', 'tATLAS',
                'tASASSN', 'tSMT', 'tCATA', 'tSNhunt', 'tSNHunt', 'fSNhunt',
                'tSNHiTS', 'tCSS', 'tSSS', 'tCHASE', 'tSN', 'tAT', 'fPSN',
                'PHASE'
            ]

            regexes = "(" + ")|(".join(regexes) + ")"
            if re.match(regexes, filename):
                status = 'rapid'
            else:
                status = 'final'
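            # Filenames starting with one of the survey/telescope prefixes above are
            # treated as 'rapid' (preliminary) reductions; everything else is 'final'.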

            instrument = children[instrument_idx].text
            observer = children[observer_idx].text
            obsdate = children[obsdate_idx].text
            reducer = children[reducer_idx].text
            last_modified = children[last_mod_idx].text
            modified_by = children[modified_by_idx].text

            contrib = children[contrib_idx].text
            bibcode = children[publish_idx].text
            bibcode = unicodedata.normalize("NFKD", bibcode)
            if (contrib == ('Ruiz-Lapuente, et al. 1997, Thermonuclear '
                            'Supernovae. Dordrecht: Kluwer')):
                bibcode = '1997Obs...117..312R'
                contrib = 'Ruiz-Lapuente et al. 1997'
            elif '%26' in bibcode:
                bibcode = bibcode.replace('%26', '&')

            SN_dict[SNname][filename] = OrderedDict([
                ("Type", SNtype), ("Redshift", redshift),
                ("Obs. Date", obsdate), ("Program", program),
                ("Contributor", contrib), ("Bibcode", bibcode),
                ("Instrument", instrument), ("Observer", observer),
                ("Reducer", reducer), ("Reduction Status", status),
                ("Last Modified", last_modified), ("Modified By", modified_by)
            ])

            spectrum_haul[filename] = dat_url
            num_pub_spectra += 1

        # Metadata for SNname is now available.
        # The following filters cases by the number of
        # spectra that appear on the WISeREP page.

        if len(spectrum_haul) == 0:
            print('\tNot collecting spectra at this time')
            with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                f.write('Not collecting spectra of ' + SNname + ' at this time'
                        + '\n')

            with open(_PATH + path + SNname + '/README.json', 'w') as fp:
                json.dump(SN_dict[SNname], fp, indent=4)

            updateListsJson(SNname, list_dict['completed'], list_dict, path)
            continue

        elif len(spectrum_haul) == 1:

            # remove host spectrum if it exists
            if SNname in obj_host_dict.keys():
                if obj_host_dict[SNname]['Filename'] in SN_dict[SNname].keys():
                    filename = obj_host_dict[SNname]['Filename']
                    del SN_dict[SNname][filename]

                    print('\tPurging host galaxy spectrum --', filename)

                print('\tNot collecting spectra at this time')
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('Not collecting spectra of ' + SNname +
                            ' at this time' + '\n')

                updateListsJson(SNname, list_dict['completed'], list_dict,
                                path)
                continue

            print('\tDownloading 1 public spectrum')

            # make SNname subdirectory
            # os.mkdir(_PATH+path+SNname)
            # mkSNdir(SNname, path)

            for filename, url in spectrum_haul.items():
                if filename in wiserep_spectrum_ignore:
                    print('\tIgnoring spectrum for', SNname,
                          '-- see sne-external-spectra/donations')
                    continue
                else:
                    rq = Request(url)
                    res = urlopen(rq)
                    dat = open(_PATH + path + SNname + "/" + filename, 'wb')
                    dat.write(res.read())
                    dat.close()

            # add README for basic metadata to SNname subdirectory
            print('\tWriting README')
            with open(_PATH + path + SNname + '/README.json', 'w') as fp:
                json.dump(SN_dict[SNname], fp, indent=4)

            updateListsJson(SNname, list_dict['completed'], list_dict, path)

        elif len(spectrum_haul) > 1:

            # make SNname subdirectory
            # os.mkdir(_PATH+path+SNname)
            # mkSNdir(SNname, path)

            SN_files = deepcopy(SN_dict[SNname])
            for filename, metadata in SN_files.items():
                if metadata['Reduction Status'] == 'rapid':
                    del SN_dict[SNname][filename]
                    del spectrum_haul[filename]

                    print('\tRemoving duplicate spectrum for', SNname, '--',
                          filename)
                    with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                        f.write('Removing duplicate spectrum for ' + SNname +
                                ' -- ' + filename + '\n')

            # remove host spectrum if it exists
            if SNname in obj_host_dict.keys():
                if obj_host_dict[SNname]['Filename'] in SN_dict[SNname].keys():
                    filename = obj_host_dict[SNname]['Filename']
                    del SN_dict[SNname][filename]

                    print('\tPurging host galaxy spectrum --', filename)

            # need to continue to next supernova if host spectrum was only one
            if len(SN_dict[SNname].keys()) == 0:
                print('\tNot collecting spectra at this time')
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('Not collecting spectra of ' + SNname +
                            ' at this time' + '\n')
                updateListsJson(SNname, list_dict['completed'], list_dict,
                                path)
                continue

            last_modified = {}
            SN_files = deepcopy(SN_dict[SNname])
            for k, d in SN_files.items():
                for l, e in SN_files.items():
                    aa = d['Obs. Date'] == e['Obs. Date']
                    bb = d['Instrument'] == e['Instrument']
                    cc = d['Observer'] == e['Observer']
                    dd = d['Modified By'] == 'ofer-UploadSet'
                    ee = d['Modified By'] == e['Modified By']
                    if aa and bb and cc and dd and ee and k != l:  # see 2012fs
                        date = SN_dict[SNname][k]['Last Modified']
                        newdate = time.strptime(date, '%Y-%m-%d')
                        last_modified[k] = newdate

                    elif aa and bb and cc and k != l:  # see 2016bau
                        date = SN_dict[SNname][k]['Last Modified']
                        newdate = time.strptime(date, '%Y-%m-%d')
                        last_modified[k] = newdate

            if len(last_modified) <= 1:
                print('\tPresumably no other duplicate files found for',
                      SNname)
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('Presumably no other duplicate files found for ' +
                            SNname + '\n')

            elif len(last_modified) == 2:
                duplicate = min(last_modified, key=last_modified.get)
                del SN_dict[SNname][duplicate]
                del spectrum_haul[duplicate]

                print('\tRemoving duplicate spectrum for', SNname, '--',
                      duplicate)
                with open(_PATH + path + 'scraper-log.txt', 'a') as f:
                    f.write('Removing duplicate spectrum for ' + SNname +
                            ' -- ' + duplicate + '\n')

            count = 1
            for filename, url in spectrum_haul.items():
                print('\tDownloading', count, 'of', len(SN_dict[SNname]),
                      'public spectra')

                if filename in wiserep_spectrum_ignore:
                    print('\tIgnoring spectrum for', SNname,
                          '-- see sne-external-spectra/donations')
                    continue
                else:
                    rq = Request(url)
                    res = urlopen(rq)
                    dat = open(_PATH + path + SNname + "/" + filename, 'wb')
                    dat.write(res.read())
                    dat.close()

                count += 1

            # add README for basic metadata to SNname subdirectory
            print('\tWriting README')
            with open(_PATH + path + SNname + '/README.json', 'w') as fp:
                json.dump(SN_dict[SNname], fp, indent=4)

            updateListsJson(SNname, list_dict['completed'], list_dict, path)

    # reset completed to 0 once all done
    list_dict['completed'] = []
    with open(_PATH + path + 'lists.json', 'w') as fp:
        json.dump(list_dict, fp, indent=4)

    # execution time in minutes
    minutes = (time.time() - start_time) / 60.0
    print("Runtime: %s minutes" % minutes)
    with open(_PATH + path + 'scraper-log.txt', 'a') as f:
        f.write('Runtime: ' + str(minutes) + ' minutes')
Example #49
File: torrent.py  Project: rbi13/piv
def scrapeDetails(details):
	regex = 'Uploaded (?P<date>([-:\d\s]+)), Size (?P<sizeNum>(\d+\.\d+)) (?P<sizeMeas>(\w+)), ULed by (?P<uploader>(\w+))'
	# regex = 'Uploaded (?P<date>([-:\d\s]+)),'
	ret = re.search(regex,details.replace(u'\xa0',u' '))
	return ret.groupdict() if ret else {} 
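# A quick illustration of what scrapeDetails returns (the sample details string is made up):
#   scrapeDetails(u'Uploaded 03-20 2015, Size 1.40 GiB, ULed by someuser')
#   -> {'date': '03-20 2015', 'sizeNum': '1.40', 'sizeMeas': 'GiB', 'uploader': 'someuser'}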
	


base = "http://thepiratebay.se/search/"

search = "the green mile".replace(' ','%20')
opts = "/0/7/0"

url = base+search+opts

browser = RoboBrowser()
browser.open(url)

rows = browser.find_all('tr')

for row in rows:
	# print row
	elem = row.find('font',{'class':'detDesc'})
	if elem:
		print elem.text.replace('\n','')
		print scrapeDetails(elem.text.replace('\n',''))

# browser.follow_link('generateLink')
# browser.follow_link('HERE')
         'bush', 'carson', 'christie', 'cruz', 'fiorina', 'gilmore', 'graham',
         'huckabee', 'jindal', 'kasich', 'pataki', 'paul', 'perry', 'rubio', 'santorum',		 
         'trump', 'walker', 'romney', 'election', 'presidential', 'cycle', 'primary',
         'primaries', 'candidate', 'race']
		 
## dates to search in 2015
months, days = range(1, 9), range(1, 32)
dates = itertools.product(months, days)

## search the archives for potentially relevant material
browser = RoboBrowser(history = True)
relevant_urls = []
bad_urls = []
for date in dates:
    m, d = date[0], date[1]
    archive_url = 'http://www.wsj.com/public/page/archive-2015-' + str(m) + '-' + str(d) + '.html'
    try:
        browser.open(archive_url)
        articles = browser.find_all('h2')
        for article in articles:
            if any(word in article.get_text().lower() for word in terms):
                relevant_urls.append(article.find('a').get('href'))
    except:
        bad_urls.append(archive_url)
        pass 

## save the urls 
with open('wsj_article_urls.txt', 'w') as f:
    f.write(json.dumps(relevant_urls))
Example #51
class StitchBot(object):
    def __init__(self, output_path=None, username=None, password=None):
        self.browser = RoboBrowser(history=True)
        self.output_path = output_path or tempfile.TemporaryDirectory().name

        self.username = username or os.environ['STITCHBOT_USERNAME']
        self.password = password or os.environ['STITCHBOT_PASSWORD']

        self.logger = logger.getChild('StitchBot')

    def log(self, level, method_name, message, *args, **kwargs):
        child_logger = self.logger.getChild(method_name)
        child_logger.log(level, message, *args, **kwargs)

    def scrape(self):
        self.log(logging.INFO, 'scrape', 'Starting scrape')

        self.log_in()
        self.navigate_to_free_pattern()
        scraped_filenames = self.download_pattern()

        self.log(logging.INFO, 'scrape', 'Scrape complete')

        return scraped_filenames

    def log_in(self):
        self.log(logging.INFO, 'log_in', 'Logging in')

        self.browser.open('http://dailycrossstitch.com/my-account/')
        form = self.browser.get_form(class_='login')
        form['username'] = self.username
        form['password'] = self.password
        self.browser.submit_form(form)

        self.log(logging.INFO, 'log_in', 'Logged in')

    def navigate_to_free_pattern(self):
        self.log(
            logging.INFO, 'navigate_to_free_pattern', 'Finding free pattern')

        self.browser.open('http://dailycrossstitch.com/')
        free_button = self.browser.find('a', class_='button', string='FREE')
        self.browser.follow_link(free_button)

        self.log(
            logging.INFO, 'navigate_to_free_pattern', 'Found free pattern')

    def download_pattern(self):
        self.log(logging.INFO, 'download_pattern', 'Downloading pattern')

        download_buttons = self.browser.find_all(
            'a', class_='single_add_to_cart_button')
        download_urls = list(map(itemgetter('href'), download_buttons))
        local_filenames = [
            self.download_pattern_file(url) for url in download_urls]

        self.log(logging.INFO, 'download_pattern', 'Downloaded pattern')

        return local_filenames

    def download_pattern_file(self, url):
        self.log(
            logging.INFO, 'download_pattern_file',
            'Downloading pattern file at {0}'.format(url))

        self.browser.open(url)
        download_script = self.browser.find(
            'script', string=re.compile(r'^\s*function startDownload'))
        if not download_script:
            return

        pdf_url_match = re.search(r'(http.+\.pdf)', download_script.string)
        if not pdf_url_match:
            return

        pdf_url = pdf_url_match.group(1)
        self.browser.open(pdf_url)

        output_filename = self.save_pattern(self.browser.response)

        self.log(
            logging.INFO, 'download_pattern_file',
            'Downloaded pattern file at {0}'.format(url))

        return output_filename

    def save_pattern(self, response):
        self.log(logging.INFO, 'save_pattern', 'Saving pattern')

        try:
            os.makedirs(self.output_path)
        except OSError:
            pass

        filename = self.get_filename(response.headers)
        output_filename = os.path.join(self.output_path, filename)
        with open(output_filename, 'wb') as output_file:
            output_file.write(response.content)

        self.log(
            logging.INFO, 'save_pattern',
            'Saved pattern to {0}'.format(output_filename))

        return output_filename

    def get_filename(self, headers, default_filename='pattern.pdf'):
        filename_match = re.search(
            r'filename="?([^"]+)"?', headers.get('Content-Disposition', ''))
        if not filename_match:
            return default_filename

        return filename_match.group(1)
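    # Illustrative behaviour (the header value below is a made-up example):
    #   get_filename({'Content-Disposition': 'attachment; filename="freebie.pdf"'})
    #   -> 'freebie.pdf'; returns the default 'pattern.pdf' when no filename can be parsed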
Example #52
#Actually do the search
searchList = raw_input('What would you like to search for? Separate your search queries with spaces: ')
sum = 0;

# Initialize the link-holder array and the query array
linksToVisit = []
searchQuery = searchList.split(' ')

# Iterate through all the search values
for searchVal in searchQuery:
	browser.open('https://poshmark.com/search?query=' + searchVal + '&type=people')

	# Compile list of users to go to
	print 'Here are all users you are going to visit from search ' + searchVal + ':'
	
	for link in browser.find_all('a'):
		if (str(link.get('href'))[:6] == '/user/' and str(link.get('href'))[-12:] == '/follow_user'):
	   		linksToVisit.append(str(link.get('href')))
	   		print link.get('href')

	# Actually visits them. And sleeps a lot too.
	if (len(linksToVisit) == 0):
		print 'No users were returned'
	else:
		for url in linksToVisit:
			timetosleep = int(random.random() * 10)
			print 'sleeping for ' + str(timetosleep) + ' seconds.'
			time.sleep(timetosleep)
			print 'followed user ' + url
			browser.open('https://poshmark.com' + url)
			time.sleep(int(random.random() * 2))
Example #53
from robobrowser import RoboBrowser

my_url='http://yifyhdtorrent.com/'

file_name='yify.csv'

f=open(file_name,'w')
headers='MOVIE_NAME,RATINGS\n'
f.write(headers)

browser=RoboBrowser(history=True)
browser.open(my_url)

browser.parsed('html')

containers=browser.find_all('div',attrs={'class':'smp-view'})

for container in containers:
	name_container=container.find('div',attrs={'class':'title-video'})
	movie_name=name_container.a
	name=movie_name.text

	ratings=container.div.text.strip()
	
	#print('NAME : '+name+'   \nRatings :'+ ratings[0] +'\n')

	f.write(name+','+ratings[0]+'\n')
	
f.close()
Example #54
File: custom.py  Project: lincht/TradeModel
def get_custom(year, month):
    
    with open('elevens_list_noname.txt', encoding = 'utf-8', mode = 'r') as file:
        elevens = eval(file.read())

    filename = str(year + 1911) + '-' + str(month).zfill(2) + '.txt'
    with open(filename, encoding = 'utf-8', mode = 'w') as output:
        header = '國家|貨品分類|中文貨名|英文貨品|數量|數量單位|重量|重量單位|價值\n'
        # (country | commodity classification | Chinese name | English name | quantity | quantity unit | weight | weight unit | value)
        output.write(header)
    
        for good in range(0, len(elevens) // 250 * 250 - 250 + 1, 250):
            goodsGroup = ','.join(elevens[good : good + 250])
        
            payload = [('minYear', '92'),
                       ('maxYear', '105'),
                       ('maxMonth', '6'),
                       ('minMonth', '1'),
                       ('maxYearByYear', '104'),
                       # 3: total import value (incl. re-imports), 6: total export value (incl. re-exports)
                       ('searchInfo.TypePort', '6'),
                       # data period: 0 = monthly, 1 = yearly
                       ('searchInfo.TypeTime', '0'),
                       # Year range: 92-105
                       ('searchInfo.StartYear', str(year)),
                       ('searchInfo.StartMonth', str(month)),
                       ('searchInfo.EndMonth', str(month)),
                       # 11-digit tariff code
                       ('searchInfo.goodsType', '11'),
                       ('searchInfo.goodsCodeGroup', goodsGroup),
                       # leaving the country field at its placeholder text selects all countries
                       ('searchInfo.CountryName', '請點選國家地區'),
                       # rbMoney1: New Taiwan Dollar, rbMoney2: US Dollar
                       ('searchInfo.Type', 'rbMoney2'),
                       # rbByGood: sort by commodity, rbByCountry: by country
                       ('searchInfo.GroupType', 'rbByCountry'),
                       ('Search', '開始查詢')]
                   
            while True:
                try:
                    browser = RoboBrowser()
                    browser.open(url + urllib.parse.urlencode(payload), verify = False)
                    if browser.response.status_code == 200:
                        break
                except:
                    print('An error has occurred. Retrying.')
                    print(browser.response.text)
                    sleep(60)
        
            dataListNumber = 'dataList_' + str(month)
            table = browser.find_all('table', {'id':dataListNumber})
            tds = []
            for table_element in table:
                rows = table_element.find_all('tr')
                for row in rows:
                    td = row.find_all('td')
                    tds.append(td)
                
            data = ''
            for index in range(1, len(tds)):
                row_data = tds[index]
                if row_data[1].text == '合計':  # skip the '合計' (grand total) row
                    continue
                for data_index in range(8):
                    data += row_data[data_index].text + '|'
                data += row_data[8].text + '\n'

            output.write(data)
            terminal_size = shutil.get_terminal_size()[0]
            print('Data for',
                  goodsGroup[0:10] + '-' +
                  goodsGroup[(len(goodsGroup) - 10):len(goodsGroup)],
                  calendar.month_name[month] + ', %s' % (year + 1911),
                  'written on', strftime("%Y-%m-%d %H:%M:%S"))
    
        goodsGroup2 = ','.join(elevens[len(elevens) // 250 * 250 : len(elevens)])

        payload = [('minYear', '92'),
                   ('maxYear', '105'),
                   ('maxMonth', '6'),
                   ('minMonth', '1'),
                   ('maxYearByYear', '104'),
                   # 3: total import value (incl. re-imports), 6: total export value (incl. re-exports)
                   ('searchInfo.TypePort', '6'),
                   # data period: 0 = monthly, 1 = yearly
                   ('searchInfo.TypeTime', '0'),
                   # Year range: 92-105
                   ('searchInfo.StartYear', str(year)),
                   ('searchInfo.StartMonth', str(month)),
                   ('searchInfo.EndMonth', str(month)),
                   # 11-digit tariff code
                   ('searchInfo.goodsType', '11'),
                   ('searchInfo.goodsCodeGroup', goodsGroup2),
                   # leaving the country field at its placeholder text selects all countries
                   ('searchInfo.CountryName', '請點選國家地區'),
                   # rbMoney1: New Taiwan Dollar, rbMoney2: US Dollar
                   ('searchInfo.Type', 'rbMoney2'),
                   # rbByGood: sort by commodity, rbByCountry: by country
                   ('searchInfo.GroupType', 'rbByCountry'),
                   ('Search', '開始查詢')]
                   
        while True:
            try:
                browser = RoboBrowser()
                browser.open(url + urllib.parse.urlencode(payload), verify = False)
                if browser.response.status_code == 200:
                    break
            except:
                print('An error has occurred. Retrying.')
                print(browser.response.text)
                sleep(60)

        dataListNumber = 'dataList_' + str(month)
        table = browser.find_all('table', {'id':dataListNumber})
        tds = []
        for table_element in table:
            rows = table_element.find_all('tr')
            for row in rows:
                td = row.find_all('td')
                tds.append(td)
            
        data = ''
        for index in range(1, len(tds)):
            row_data = tds[index]
            if row_data[1].text == '合計':  # skip the '合計' (grand total) row
                continue
            for data_index in range(8):
                data += row_data[data_index].text + '|'
            if index != len(tds) - 1:
                data += row_data[8].text + '\n'
            else:
                data += row_data[8].text

        output.write(data)
        print('Data for',
              goodsGroup2[0:10] + '-' +
              goodsGroup2[(len(goodsGroup2) - 10):len(goodsGroup2)],
              calendar.month_name[month] + ', %s' % (year + 1911),
              'written on', strftime("%Y-%m-%d %H:%M:%S"))
    
    print('=' * terminal_size +
    calendar.month_name[month] + ', %s' % (year + 1911),
    'data successfully downloaded on', strftime("%Y-%m-%d %H:%M:%S") + '\n' +
    '=' * terminal_size + '\n')
    return()
Example #55
browser = RoboBrowser(session=session)
## DVD Queue
browser.open('http://dvd.netflix.com/Queue?prioritized=true&qtype=DD', cookies = cj)

# get the form
queue_form = browser.get_form(class_='hasAwaitingRelease')
queue_submit = queue_form.submit_fields['updateQueue2']

predictions = []
skip_keys = ["authURL", "updateQueueBtn", "updateQueue1", "queueHeader", "updateQueue2"]
for key in queue_form.keys():
    if key in skip_keys:
        continue
    if 'OP' in key:
        continue
    spans = browser.find_all("input", {"name" : key })[0].findAllNext("span")
    for s in spans:
        if s is not None:
            for k in s.findChildren():
                for c in k['class']:
                    if 'sbmf-' in c:
                        predicted_rating = c.strip("sbmf-")
                        if key not in (item[0] for item in predictions):
                            predictions.append((key, predicted_rating))
sorted_preds = sorted(predictions, key=lambda x: float(x[1]), reverse=True)
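# sorted_preds now holds (queue form field name, predicted star rating) pairs,
# ordered from highest to lowest predicted rating.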

# for i in xrange(len(sorted_preds)):
#     in_arg = 
#     in_target
#     queue_form[sorted_preds[i][0]].value = i
# ## form submit not actually working here, it doesn't seem to take