def download_linke(coords, proxy, port, saveFile, saveMode):

    print proxy, port
    print proxy != ""

    url = "http://www.soda-is.com/eng/services/service_invoke/gui.php?" + "xml_descript=soda_tl.xml&Submit2=Month"

    session = Session()
    session.verify = False

    if proxy != "":
        proxies = {proxy: port}
        session.proxies = proxies

    br = RoboBrowser(session=session, parser="lxml")
    br.open(url)

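    # the SoDa GUI page exposes more than one form; the Linke turbidity request form is the second one (index 1)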
    linke_form = br.get_forms()[1]

    num = len(coords)
    index = 0

    with open(saveFile, saveMode) as f:
        try:
            for coord in coords:
                inlon, inlat = coord
                linke_form["lat"].value = inlat
                linke_form["lon"].value = inlon

                sf = linke_form.submit_fields.getlist("execute")
                br.submit_form(linke_form, submit=sf[0])

                linke_table = br.find("table", {"cellspacing": "0", "cellpadding": "2"})

                linkes = get_monthly_linke_str(get_linke_values(linke_table))
                s = "%s,%s,%s\n" % (format(inlon, "0.5f"), format(inlat, "0.5f"), linkes)

                if len(s) > 48:
                    f.write(s)
                    print "Done with point %i of %i: (%s, %s)" % (
                        index + 1,
                        num,
                        format(inlon, "0.5f"),
                        format(inlat, "0.5f"),
                    )

                index += 1

                br.back()

            print "DONE!"

        except Exception as e:

            not_dl = list(coords[index:])
            with open(saveFile + "_notdownloaded.txt", "w") as nd:
                for c in not_dl:
                    nd.write("%s,%s\n" % (str(c[0]), str(c[1])))
            print e
Example #2
def main():
    # Browse to Rap Genius
    # declare a parser explicitly to avoid the bs4 "no parser specified" warning
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open('http://rapgenius.com/')

    # Search for Queen
    form = browser.get_form(action='/search')
    form  # <RoboForm q=>
    form['q'].value = 'queen'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_name')
    try:
        browser.follow_link(songs[0])
    except IndexError:
        print("Songs Index doesn't exist!")
        return
    lyrics = browser.select('.lyrics')
    try:
        lyrics[0].text  # \n[Intro]\nIs this the real life...
    except IndexError:
        print("Lyrics Index doesn't exist!")

    # Back to results page
    browser.back()

    # Look up my favorite song
    browser.follow_link('death on two legs')

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    print(lyrics.text)  # \n[Verse 1]\nYou suck my blood like a leech...
Example #3
def fetch(url):
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open(url)

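    # collect the vote links from the moderators slider; followed_links keeps each href from being visited twice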
    votes = browser.select('.moderatorenSlider a.beitrag')
    followed_links = set()

    total_scores = {}

    for v in votes:
        if v["href"] in followed_links:
            continue
        else:
            followed_links.add(v["href"])
        print(v["href"])
        browser.follow_link(v)
        try:
            scores = extractVotes(browser)
            print(scores)
            for title, score in scores.items():
                if title not in total_scores:
                    total_scores[title] = (score, 1)
                else:
                    score_, num = total_scores[title]
                    total_scores[title] = (score_+score, num+1)
        except Exception as e:
            print(e)
        browser.back()
    return total_scores
Example #4
def download_internal(user_id, from_date, to_date):
    """Download the csv files for the transaction between the given dates"""
    # Create the browser and open the lloyds login page
    browser = RoboBrowser(parser='html5lib')
    browser.open(
        'https://online.lloydsbank.co.uk/personal/logon/login.jsp?WT.ac=hpIBlogon'
    )

    while 'Enter Memorable Information' not in browser.parsed.title.text:
        print(browser.parsed.title.text)
        form = browser.get_form(id='frmLogin')
        form['frmLogin:strCustomerLogin_userID'] = str(user_id)
        form['frmLogin:strCustomerLogin_pwd'] = prompt('Enter password: ')
        browser.submit_form(form)

    # We're logged in, now enter memorable information
    print(browser.parsed.title.text)
    form = browser.get_form(id='frmentermemorableinformation1')
    field = 'frmentermemorableinformation1:strEnterMemorableInformation_memInfo{0}'

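    # the memorable-information step asks for three characters; prompt for each one using the label next to its field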
    for i in range(1, 4):
        label = browser.find("label", {"for": field.format(i)})
        form[field.format(i)] = '&nbsp;' + prompt(label.text.strip())
    browser.submit_form(form)

    # hopefully now we're logged in...
    print(browser.parsed.title.text)
    links = []
    for link in browser.get_links("View statement"):
        if link.text == "View statement":
            links.append(link)

    # loop through all accounts
    for index, link in enumerate(links):
        acc_name = link['data-wt-ac'].split(" resource")[0]
        print(acc_name)
        print(browser.parsed.title)
        browser.follow_link(link)
        yield acc_name, download_account_internal(browser, from_date, to_date)
        browser.back()
Example #5
def scrape_revigo_csv(input_GOstats_tsv,
                      out_file,
                      pvalue_cutoff=0.05,
                      fdr_cutoff=1.0):
    """ 
    """
    oh = open(out_file, "w")

    # get input goterms from GOstats result
    goterms = GOstats2Revigo(input_GOstats_tsv,
                             pvalue_cutoff=pvalue_cutoff,
                             fdr_cutoff=fdr_cutoff,
                             output_column=3)
    if goterms:
        br = RoboBrowser(parser="lxml")
        br.open("http://revigo.irb.hr/")

        form = br.get_form()
        #print(form)
        form["goList"].value = goterms

        br.submit_form(form)

        download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
        br.follow_link(download_rsc_link)
        #r_code = br.response.content.decode("utf-8")
        #print(r_code)

        br.back()

        download_csv_link = br.find("a", href=re.compile("export.jsp"))
        br.follow_link(download_csv_link)
        csv_content = br.response.content.decode("utf-8")
        oh.write(csv_content)
    else:
        oh.write(
            "term_ID,description,frequency,plot_X,plot_Y,plot_size,log10 p-value,userVal_2,uniqueness,dispensability,representative,eliminated"
        )

    oh.close()
Example #6
def autoRevigo(name):
    os.chdir("/home/david/Documents/BenoitLab/RNA-seq/Gprofiler/")
    os.listdir(".")
    file = open(name, "r")

    string = ""

    for line in file.readlines():
        # print(line)
        string = string + line + "\n"

    goterms = string

    br = RoboBrowser(parser="html")
    br.open("http://revigo.irb.hr/")

    form = br.get_form()
    form["goList"].value = goterms

    br.submit_form(form)

    download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
    br.follow_link(download_rsc_link)
    r_code = br.response.content.decode("utf-8")
    print(r_code)

    br.back()

    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")

    writefile = open("/home/david/Documents/BenoitLab/RNA-seq/Revigo/" + name,
                     "w")

    writefile.write(csv_content)
    writefile.close()
    def attack(self):
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
        accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        accept_language = 'en-US,en;q=0.5'

        s = requests.Session()
        s.headers['User-Agent'] = user_agent
        s.headers['Accept'] = accept
        s.headers['Accept-Language'] = accept_language

        robo = RoboBrowser(session=s, history=True, parser='html.parser')
        robo.open(self.url)
        if self.atr_form is "id":
            form = robo.get_form(id=self.atr_value)
        elif self.atr_form is "class":
            form = robo.get_form(class_=self.atr_value)
        elif self.atr_form is "name":
            form = robo.get_form(name=self.atr_value)
        elif self.atr_form is "action":
            form = robo.get_form(action=self.atr_value)
        else:
            self.log.append("Tidak Menemukan Form Login")
            return None

        # submit deliberately wrong credentials to record the failed-login URL used as the brute-force baseline
        form[self.name_input[0]].value = "xxxxx"
        form[self.name_input[1]].value = "xxxxx"
        robo.submit_form(form)
        urlFailed = str(robo.url)

        # run the brute-force attack
        for username in self.user_list:
            for password in self.pass_list:
                robo.open(self.url)
                form[self.name_input[0]].value = username
                form[self.name_input[1]].value = password
                robo.submit_form(form)
                url = str(robo.url)
                if url != urlFailed:
                    self.log_csv.append(["Brute Force", self.url])
                    self.log.append("login is success or you has been locked out of attempts")
                    self.log.append("Url after login : "******"Username        : "******"Password        : "******"Sensitive Data Exposed", self.url])
                            self.log.append("url contain sensitive data maybe have vulnerability")
                    try:
                        # look for the PHPSESSID session cookie
                        sess = robo.session.cookies['PHPSESSID']
                        self.log_csv.append(["Session ID Exposed", self.url])
                        self.log.append("found PHPSESSID maybe have vulnerability fixation attack")
                        self.log.append("PHPSESSID : " + sess)

                        # try logging out and navigating back
                        urlLog = robo.url
                        linkLogout = robo.get_link(text="logout")
                        if linkLogout is not None:
                            robo.follow_link(linkLogout)
                            robo.back(n=1)
                            if robo.url == urlLog:
                                self.log.append("session not destroyed maybe have vulnerabilty")
                    except:
                        pass
                    return
                time.sleep(5)
        self.log.append("Brute Force failed - Login not successfull")
        return
			link['href']='https://bitbucket.org'+link['href'].encode("utf-8").strip()
			#link['href']='/odigeoteam/frontend-html5'
		print link['href']
		#print link
		browser.follow_link(link)
	
		branches = browser.select('li.branches')
		if len(branches)>0 :
			print 'branches '+ branches[0].select('span.value')[0].text
	
		tags = browser.select('li.tags')
		if len(tags)>0 :
			print 'tags' + tags[0].select('span.value')[0].text
	
		enlaces = browser.find_all('a')
		#print enlaces
		for enlace in enlaces:
			if enlace.get('href') == '#forks':
				print 'forks '+ enlace.select('span.value')[0].text
			if enlace.get('href') == '#tags':
				print 'tags '+ enlace.select('span.value')[0].text
			if enlace.get('href') == '#branches':
				print 'branches '+ enlace.select('span.value')[0].text
			if enlace.get('href') == '#followers':
				print 'watchers '+ enlace.select('span.value')[0].text
			# Back to results page
			browser.back()
	except Exception:
		pass

def scrapeTerms(num_terms=1, requested_term="", saveHtml=False):

    # define a list for course information
    courses = []

    # create a new browser to navigate search forms
    browser = RoboBrowser(parser='html.parser')

    # open course search page and get term options
    browser.open('https://www.uvic.ca/BAN1P/bwckschd.p_disp_dyn_sched')
    term_options = browser.find_all("option")

    if (debug):
        found_terms = []
        for term_option in term_options:
            found_terms.append(term_option['value'])
        print("found terms: " + str(found_terms))

    # iterate through data for all terms
    for term_option in term_options:

        # isolate term value
        term = term_option['value']

        # skip empty form values
        if len(term) == 0:
            continue

        # collect only the requested amount of data
        if (requested_term != "" and term != requested_term):
            continue

        if num_terms == 0:
            break
        else:
            num_terms -= 1
            print('COLLECTING FROM TERM ' + term + ". \n")

        # get, fill, and submit the search form (there is only one form on this page)
        term_form = browser.get_form()
        term_form['p_term'].value = term
        browser.submit_form(term_form)

        # get a list of all available subjects
        subject_dropdown = browser.find('select', attrs={"name": "sel_subj"})
        for subject_option in subject_dropdown.find_all("option"):

            # isolate subject abbreviation
            subject = subject_option['value']

            # get, fill, and submit the class schedule form
            schedule_form = browser.get_form()
            schedule_form.fields.getlist('sel_subj')[1].value = subject
            browser.submit_form(schedule_form)

            # save the search result page
            result = str(browser.parsed())

            if saveHtml:
                filename = 'ResultPages\\' + term + subject + '.html'
                with open(filename, 'w') as file:
                    try:
                        file.write(result)
                        print(filename + ' downloaded')
                    except Exception as e:
                        print("Could not save file! ", e)

            # parse the resulting html string
            print('Parsing ' + term + subject + '.html ... ', end="")
            subject_courses = parse(result, term)

            for subject_course in subject_courses:
                courses.append(subject_course)

            # go back to the class search form
            browser.back(1)

        # go back to the term selection form
        browser.back(1)

    # return from scrapeTerms
    return courses
Example #10
def _download_traces(dates, period):

    url = "http://archive.routeviews.org/route-views.wide/bgpdata/"

    MRAI_BIN_DIR = "/srv/agarcia/TFM/"  #path to save files
    if (period == 0):
        dir = MRAI_BIN_DIR + "ONE_DAY_BGP_TRACES_BZ2/"
    elif (period == 1):
        dir = MRAI_BIN_DIR + "THREE_DAY_BGP_TRACES_BZ2/"
    else:
        dir = MRAI_BIN_DIR + "BGP_TRACES_BZ2/"

    if not os.path.exists(dir):
        os.makedirs(dir)

    for _dt in dates:

        dt = datetime.datetime.strptime(_dt, "%Y-%m-%d")
        dt_web = dt.strftime("%Y") + "." + dt.strftime("%m") + "/"

        print("Opening browser...")
        br = RoboBrowser()
        br.open(url)

        #Find the date we want and click it
        link_date = br.get_link(dt_web)
        br.follow_link(link_date)

        #Find the UPDATES/ link and click it
        link_update = br.get_link("UPDATES/")
        br.follow_link(link_update)

        #Collect the days before and after the desired date
        #(e.g. 2 on each side, 5 days in total, when period == 2)
        days = []
        days.append(dt)
        if (period != 0):
            for day_p in range(1, period + 1):
                d_before = dt - datetime.timedelta(days=day_p)
                d_after = dt + datetime.timedelta(days=day_p)
                days.insert(0, d_before)
                days.append(d_after)

        print(days)

        #For each day, download all the BGP update traces
        for day in days:
            print("Downloading files of day " + day.strftime("%Y-%m-%d") +
                  "\n")
            elem = "updates." + day.strftime("%Y") + day.strftime(
                "%m") + day.strftime("%d")
            _dt_web = day.strftime("%Y") + "." + day.strftime("%m") + "/"
            br.back()
            br.back()
            br.follow_link(br.get_link(dt_web))
            br.follow_link(br.get_link("UPDATES/"))
            links = br.get_links(elem)
            for link in links:
                file = (str(link).split('"'))[1]
                url_dw = "http://archive.routeviews.org/route-views.wide/bgpdata/" + _dt_web + "UPDATES/" + file
                filename = dir + file

                r = requests.get(url_dw)
                with open(filename, "wb") as code:
                    code.write(r.content)
Example #11
def get_legistar_entries(past_entries, city, search_regex):

    agenda_url = city["agenda_site"]

    header = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    s = requests.Session()
    s.headers = header
    browser = RoboBrowser(session=s, parser="lxml")

    #Try to open the Legistar site
    try:
        browser.open(agenda_url)
        browser.submit_form(browser.get_form())
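        # agenda links have the form View.ashx?M=A&ID=<meetingid>; the ID is extracted from them below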
        links = browser.find_all(href=re.compile(r"View\.ashx\?M=A"))
    except:
        print("There was a problem opening the URL: " + agenda_url)
        print("Aborting search for agendas from " + city["name"])
        return [], []

    positive_results = []
    new_agendas = []

    for link in links:
        meetingid = str(link)
        pdf_url = city["root_site"] + str(link['href'])
        meetingid = meetingid[meetingid.find(";ID=") + 4:]
        meetingid = meetingid[0:6]

        if not any(meetingid in entry for entry in past_entries):
            #print(l)
            new_agendas = new_agendas + [meetingid]
            browser.follow_link(link)
            content = browser.response.content

            term_match = search_pdf(meetingid, content, search_regex)

            browser.back()

            if (len(term_match) > 0):
                page_body = str(browser.response.content)
                if city["uses_meetingagenda"] == True:
                    deets = re.findall(
                        "\:" + meetingid + ",.*?" + meetingid +
                        ".*?MeetingAgendaStatus", page_body)
                    details = ''.join([line for line in deets[0].split('\\')])
                    details = ''.join([line for line in details.split('\"')])
                    details = ''.join([line for line in details.split(':')])
                    details = details.replace("u0026", '&')
                    meeting_date = details[details.find("start") +
                                           5:details.find("end")].split()[0]
                else:
                    index1 = page_body.find("View.ashx?M=A&amp;ID=" +
                                            meetingid)
                    page_body = page_body[0:index1]
                    index2 = page_body.rfind("<tr")
                    page_body = page_body[index2:]
                    date_matches = re.findall('[\\d]+/[\\d]+/\\d\\d\\d\\d',
                                              page_body)
                    meeting_date = date_matches[0]

                #turn into string of hash tags
                matches = ""
                for term in set(term_match):
                    for bogus in ['-', ' ']:
                        if bogus in term:
                            term = term.replace(bogus, "")
                    matches = matches + "#" + term + ", "

                positive_results.append(
                    (meetingid, "#" + city["short"] + " #" + city["hash_tag"] +
                     " city meeting on " + meeting_date + " about " + matches,
                     pdf_url))

    return new_agendas, positive_results
Example #12
def get_non_legistar_entries(past_entries, city, search_regex):
    positive_results = []
    new_agendas = []

    header = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    s = requests.Session()
    s.headers = header
    browser = RoboBrowser(session=s, parser="lxml")
    agenda_url = city["agenda_site"]

    #non-Legistar sites need to be very specific - these sites could throw anything at you.
    #if you need to add another city, follow this format:
    #    if city["short"] == "<yourcity>":
    #        ...open the agenda page, collect its agenda links, then mirror the loop below
    if city["short"] == "berkeley":
        try:
            browser.open(agenda_url)
            links = browser.find_all("a", title="Agenda")
        except:
            print("There was a problem opening the URL: " + agenda_url)
            print("Aborting search for agendas from " + city["name"])
            return [], []

        for link in links:
            url = city["root_site"] + str(link['href'])
            meetingid = url[url.rfind("/") + 1:url.rfind(".aspx")]
            #print(meetingid)
            if not any(meetingid in entry for entry in past_entries):

                new_agendas = new_agendas + [meetingid]
                browser.follow_link(link)
                content = str(browser.response.content)
                content = content.lower()
                content = content[content.find("innercontentcontainer"):]

                term_match = []
                m = re.findall(search_regex, content.lower())
                if m is not None and len(m) > 0:
                    term_match = term_match + list(set(m))

                browser.back()

                if (len(term_match) > 0):
                    page_body = str(browser.response.content)
                    index1 = page_body.find(meetingid)
                    page_body = page_body[0:index1]
                    index2 = page_body.rfind("<tr>")
                    page_body = page_body[index2:]
                    deets = re.findall('[\\d]+/[\\d]+', page_body)
                    meeting_date = deets[0]
                    matches = ""
                    for term in set(term_match):
                        for bogus in ['-', ' ']:
                            if bogus in term:
                                term = term.replace(bogus, "")
                        matches = matches + "#" + term + ", "
                    positive_results.append(
                        (meetingid, "#" + city["short"] + " #" +
                         city["hash_tag"] + " city meeting on " +
                         meeting_date + " about " + matches, url))

    elif city["short"] == "berkeleyprc" or city["short"] == "berkeleyp&j":
        try:
            browser.open(agenda_url)
            links = browser.find_all("a", title=re.compile(".genda"))
        except:
            print("There was a problem opening the URL: " + agenda_url)
            print("Aborting search for agendas from " + city["name"])
            return [], []

        for link in links:
            meetingid = str(link)
            url = city["root_site"] + str(link['href']).replace(" ", "%20")
            #print(url)
            pdf_index = url.rfind(".pdf")
            if pdf_index < 0:
                meetingid = url[url.rfind("/") + 1:]
                if not any(meetingid in entry for entry in past_entries):
                    new_agendas = new_agendas + [meetingid]
                continue
            meetingid = url[url.rfind("/") + 1:pdf_index]
            if not any(meetingid in entry for entry in past_entries):
                new_agendas = new_agendas + [meetingid]
                browser.follow_link(link)
                content = browser.response.content

                term_match = search_pdf(meetingid, content, search_regex)

                browser.back()
                if (len(term_match) > 0):
                    searchdex = str(link['title'])
                    deets = searchdex.split()
                    meeting_date = deets[0].lower()
                    for bogus in string.ascii_letters:
                        if bogus in meeting_date:
                            meeting_date = meeting_date.replace(bogus, "")

                    matches = ""
                    for term in set(term_match):
                        for bogus in ['-', ' ']:
                            if bogus in term:
                                term = term.replace(bogus, "")
                        matches = matches + "#" + term + ", "
                    positive_results.append(
                        (meetingid,
                         "#" + city["short"] + " #" + city["hash_tag"] +
                         " mtg on " + meeting_date + " about " + matches, url))
    else:
        return [], []

    return new_agendas, positive_results
Example #13
b.response.status_code
b.links
dir(b)
b.get_links()
b = RoboBrowser(parser="lxml")
b.open("http://www.chandrashekar.info")
b.get_links()
b.get_links()
b.forms
dir(b)
b.get_links()
b.get_links()[-3]
l = b.get_links()[-3]
b.follow_link(l)
b.url
b.back()
b.url
b.forward()
b.url
b.get_forms()
b.get_forms()[0]
f = b.get_forms()[0]
f
f["name"] = "smith"
f["subject"] = "dslfj lsdjf lsdjf lksdj flsdjf"
f
b.submit_form(f)
import requests
r = requests.get("http://pypi.python.org/pypi", params={":action" : "search",
                               "term" : term,
                               "submit" : "search"})
def download_linke(coords, proxy, port, saveFile, saveMode):

    # print proxy,  port
    # print proxy != ''

    url = ("http://www.soda-is.com/eng/services/service_invoke/gui.php?" +
           "xml_descript=soda_tl.xml&Submit2=Month")

    # url = "http://www.soda-pro.com/web-services/atmosphere/turbidity-linke-2003"

    session = Session()
    session.verify = False

    if proxy != '':
        proxies = {proxy: port}
        session.proxies = proxies

    br = RoboBrowser(session=session, parser="lxml")
    br.open(url)

    linke_form = br.get_forms()[1]

    num = len(coords)
    index = 0

    with open(saveFile, saveMode) as f:
        try:
            for coord in coords:
                inlon, inlat = coord
                linke_form['lat'].value = inlat
                linke_form['lon'].value = inlon

                sf = linke_form.submit_fields.getlist('execute')
                br.submit_form(linke_form, submit=sf[0])

                linke_table = br.find("table", {
                    "cellspacing": "0",
                    "cellpadding": "2"
                })

                linkes = get_monthly_linke_str(get_linke_values(linke_table))
                s = "%s,%s,%s\n" % (format(inlon, '0.5f'), format(
                    inlat, '0.5f'), linkes)

                if len(s) > 48:
                    f.write(s)
                    print "Done with point %i of %i: (%s, %s)" % (
                        index + 1, num, format(inlon,
                                               '0.5f'), format(inlat, '0.5f'))

                index += 1

                br.back()

            print "DONE!"

        except Exception as e:

            not_dl = list(coords[index:])
            with open(saveFile + "_notdownloaded.txt", "w") as nd:
                for c in not_dl:
                    nd.write("%s,%s\n" % (str(c[0]), str(c[1])))
            print e
Example #15
def scrape_revigo(data_dir):

    f = open(os.path.join(data_dir, "goterms.txt"), "r")
    goterms = f.read()
    f.close()

    br = RoboBrowser(parser="lxml")
    br.open("http://revigo.irb.hr/")

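    # grab the first form on the REVIGO start page and paste the GO term list into its goList field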
    form = br.get_form()
    form["goList"].value = goterms

    br.submit_form(form)

    download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
    br.follow_link(download_rsc_link)
    r_code = br.response.content.decode("utf-8")

    f = open(os.path.join(data_dir, "rsc.R"), "a")
    f.write(r_code)
    f.close()

    br.back()

    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")

    f = open(os.path.join(data_dir, "rsc.csv"), "a")
    f.write(csv_content)
    f.close()

    br.back()

    download_tree_link = br.find("a", href=re.compile("toR_treemap.jsp"))
    br.follow_link(download_tree_link)
    r_code = br.response.content.decode("utf-8")

    f = open(os.path.join(data_dir, "tree_map.R"), "a")
    f.write(r_code)
    f.close()

    br.back()

    download_csv_link = br.find("a", href=re.compile("export_treemap.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")

    f = open(os.path.join(data_dir, "tree_map.csv"), "a")
    f.write(csv_content)
    f.close()

    br.back()

    # get cytoscape graph
    cytoscape_link = br.find("a", href=re.compile("download.jsp"))
    br.follow_link(cytoscape_link)
    cytoscape_content = br.response.content.decode("utf-8")

    f = open(os.path.join(data_dir, "cytoscape_map.xgmml"), "a")
    f.write(cytoscape_content)
    f.close()
Example #16
import re
from robobrowser import RoboBrowser

# Browse to Rap Genius
browser = RoboBrowser(history=True)
browser.open('http://rapgenius.com/')

# Search for Queen
form = browser.get_form(action='/search')
form  # <RoboForm q=>
form['q'].value = 'queen'
browser.submit_form(form)

# Look up the first song
songs = browser.select('.song_name')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text  # \n[Intro]\nIs this the real life...

# Back to results page
browser.back()

# Look up my favorite song
browser.follow_link('death on two legs')

# Can also search HTML using regex patterns
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
Example #17
def scrape_bio_and_albums(keywords):
    """
    scrapes artists' data on https://www.lyrics.com/, and stores text (bio and albums)
    from an HTML page source of each artist's page in file (text&csv files).
    :param keywords: list of keywords that should represent artists' names (list)
    :return:
    """
    if keywords and isinstance(keywords, list):

        # builds new object of RoboBrowser with given params
        browser = RoboBrowser(
            parser='html.parser',
            user_agent=
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            history=True,
            timeout=10)
        # Open a URL (using 'RoboBrowser' library).
        browser.open(BASE_URL)

        for keyword in keywords:
            if keyword and len(keyword) > 1:

                # get browser url (should be 'old' after searching a term - if browser goes to new url)
                old_url = browser.url

                # trying to search keyword on 'lyrics.com' (using RoboBrowser's methods to handle forms)
                form = browser.get_form(
                    id='search-frm')  # Find form by ID 'search-frm'
                form[
                    'st'].value = keyword  # sets query value 'st' with given keyword
                browser.submit_form(
                    form)  # Submit a form - to search given keyword

                # check if the url is changed (after searching a keyword)
                if old_url != browser.url:

                    # select required <a> tags, using CSS Selectors (see on BeautifulSoup's documentation)
                    a_tags = browser.select(
                        'body p[class~=serp-flat-list] a[href^="artist/"]')

                    if a_tags:
                        # browser.follow_link(a_tags[0])

                        # builds base url with href - to open required url using 'open()' method,
                        # and avoid including the "/lyrics/" part in url, when using 'follow_link()' method
                        first_artist_url = a_tags[0]['href'].replace(
                            "artist", BASE_URL + "artist")

                        # Open URL (should get url of the first suggested artist's page in results)
                        browser.open(first_artist_url)

                        # parse response content (bs4 obj), using HTML parser specified by the browser
                        soup = browser.parsed

                        if soup:
                            artist_bio_tag = soup.find(
                                class_='artist-bio')  # find tag by class
                            if artist_bio_tag:
                                # save parsed text (artist bio) from page source to a text file
                                save_source(keyword + " - bio",
                                            artist_bio_tag.get_text(),
                                            dir_path=os.path.join(
                                                ARTISTS_PATH, keyword))
                                # parse albums&songs from html tables, and save the data to a csv file
                                albums_to_csv(soup,
                                              keyword + " - albums",
                                              dir_path=os.path.join(
                                                  ARTISTS_PATH, keyword))

                        browser.back()  # Go back in browser history.
                    browser.back()  # Go back in browser history.
        break
    curImp = []
    curText = br.parsed()
    for j in curText:
        curImp.extend(j.find_all('div', class_='td-post-content'))
    curS = list(set(curImp))
    curRow = object()
    for i in curS:
        curRow = i
        break
    print("Movie Data")
    star = curRow.p
    starText = star.get_text()
    cast = star.next_sibling
    castText = cast.get_text()
    direct = cast.next_sibling
    directText = direct.get_text()
    starsSplit = starText.split(":")
    starsAct = starsSplit[1] if len(starsSplit) >= 2 else ""
    castSplit = castText.split(":")
    castAct = castSplit[1] if len(castSplit) >= 2 else ""
    directSplit = directText.split(":")
    directAct = directSplit[1] if len(directSplit) >= 2 else ""

    string = names + "," + date + "," + firstday + "," + firstWeekend + "," + lifetime + "," + starsAct + "[" + castAct + "]" + "," + "[" + directAct + "]" + "\n"
    print(string)
    csv.write(string)
    br.back()
if csv:
    csv.close()