Example #1
File: route.py Project: ashawkey/luna
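# Search yinghuacd.com for a keyword, follow up to three result pages, and collect each episode's title and video URL via XPath selectors.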
def crawl(keyword):
    urls = []
    g = Grab()
    g.go(f'http://www.yinghuacd.com/search/{keyword}/')
    candidate_selectors = g.doc('//div[@class="lpic"]/ul/li/a/@href')
    # tmpfix: avoid too many inaccurate results
    if len(candidate_selectors) >= 3:
        candidate_selectors = candidate_selectors[0:3]
        print(f'[WARN] too many candidates for {keyword}, only keep the first three.')
    # for candidates
    for candidate in candidate_selectors:
        g.go(f'http://www.yhdm.so{candidate.text()}')
        episode_selectors = g.doc('//div[@class="movurl"]/ul/li/a/@href')
        # tmpfix: avoid too many episodes
        if len(episode_selectors) >= 30:
            episode_selectors = episode_selectors[:30]
            print('[WARN] too many episodes, only keep the first 30.')
        # for episodes
        for episode in episode_selectors:
            g.go(f'http://www.yhdm.so{episode.text()}')
            title_selectors = g.doc('//div[@class="gohome l"]/h1')
            data_selectors = g.doc('//div[@id="playbox"]/@data-vid')
            title = title_selectors[0].text()
            url = data_selectors[0].text()
            # tmpfix: strip a trailing "$mp4" marker from the video URL
            if url[-4:] == "$mp4":
                url = url[:-4]
            # append to results
            print(f'[INFO] crawled {title} {url}')
            urls.append({
                'formattedUrl': url,
                'title': title,
                'snipped': '',
            })
    return urls
Example #2
File: get.py Project: qwjhb/syosetu
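# Download every chapter of a novel from novel18.syosetu.com: scrape the table of contents with Grab, then fetch the chapter pages with a four-worker thread pool.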
# Imports assumed for a self-contained example; the original file may import these differently.
from multiprocessing.pool import ThreadPool
import os
import shutil
import urllib.request

from grab import Grab


def download():
    url = input('url:')
    pool = ThreadPool(4)
    main_page = Grab()
    main_page.go(url)
    title = main_page.doc('//*[@id="novel_color"]/p').text()
    # Start from a clean temp directory.
    if os.path.isdir('temp'):
        shutil.rmtree('temp')
    os.mkdir('temp')
    path = 'temp/'
    main_page.doc.save(path + 'main.html')
    # Collect chapter links from the table of contents.
    urls_xpath = main_page.doc('//*[@id="novel_color"]/div/dl/dd/a')
    urls = []
    chapters = {}
    for link in urls_xpath:
        urls.append('http://novel18.syosetu.com' + link.attr('href'))
        key = link.attr('href').split('/')[2]
        chapters[key] = link.text().replace('/', ' ')
    print(chapters)

    def pages(page_url):
        # Save each chapter as NNNN.<chapter title>.html under temp/.
        num = page_url.split('/')[4]
        data = urllib.request.urlopen(page_url).read()
        with open(path + '%04u.%s.html' % (int(num), chapters[num]), 'wb') as out:
            out.write(data)
        print(num, chapters[num])

    # Fetch the chapter pages with four worker threads.
    pool.map(pages, urls)
Example #3
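# Request a throwaway address from fakemailgenerator.com and store the generated name/domain pair on the instance.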
def get_new_address(self):
    base_url = "http://www.fakemailgenerator.com/"
    g = Grab()
    g.setup(connect_timeout=20, timeout=20)
    g.go(base_url)
    if g.response.code == 200:
        email_name = g.doc('//*[@id="home-email"]/@value').text()
        email_site = g.doc('//*[@id="domain"]').text()
        self.full_email = email_name, email_site
Example #4
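# HTTP Cloud Function: scrape a rabota.ua vacancy-list URL (passed as the ?url= query parameter) with Grab and return the vacancies as an Atom feed built with feedgen.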
def feed_http(request):
    """HTTP Cloud Function.
    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text, or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.
    """
    request_args = request.args
    url = request_args['url']
    g = Grab()
    fg = FeedGenerator()
    g.go(url)

    fg.id(url)
    fg.title('Rabota.UA | rss feed')
    url_parsed = urlparse(g.response.url)
    fg.link(href=url_parsed.scheme + '://' + url_parsed.hostname,
            rel='alternate')
    fg.description(g.doc('/html/head/title').text())
    count = int(
        g.doc('//span[@id="ctl00_content_vacancyList_ltCount"]/span').one().
        text())
    if count == 0:
        itm_list = []
    else:
        articles = g.doc.select(
            '//table[contains(@class, "f-vacancylist-tablewrap")]').one()
        itm_list = articles.select(
            'tr[@id]/td/article/div[contains(@class, "card-body")]')
    for item in itm_list:
        vac_title = item.select(
            'div[1]//h2[contains(@class, "card-title")]/a/@title').text(
            ).strip()
        vac_url = g.make_url_absolute(
            item.select(
                'div[1]//h2[contains(@class, "card-title")]/a/@href').text())
        try:
            vac_description = item.select(
                'div[contains(@class, "card-description")]').text().strip()
        except weblib.error.DataNotFound:
            vac_description = 'N/A'
        fe = fg.add_entry()
        print(vac_title)
        fe.id(vac_url)
        fe.link({'href': vac_url})
        fe.source(vac_url)
        fe.title(vac_title)
        fe.description(vac_description)

    response = make_response(fg.atom_str(pretty=True, extensions=False))
    response.headers['Content-Type'] = 'application/rss+xml; charset=UTF-8'
    return response
Example #5
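# Fetch a mults.info page and extract the cartoon's title, release year, and torrent link.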
class MultsInfo:
    def __init__(self, url):
        self.url = url
        self.g = Grab()
        
        self.go()
    
    def go(self):
        self.g.go(self.url)
        self.name = self.g.doc('//html/body/center/div/div[2]/h1').text().split('"')[1]
        self.year = self.g.doc('//html/body/center/div/div[2]/div[3]/p[1]/a[1]').text().split()[1]
        self.turl = "http://mults.info/" + self.g.doc('//a[b="torrent"]/@href').text()
Example #6
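# Look up a title on Kinopoisk and return the movie id parsed from the twitter:app:url:iphone meta tag on the search results page.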
def get_by_name(name):
    url = KINOPOISK_SEARCH % quote(name)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'
    }
    g = Grab(log_dir = 'tmp', referer = 'https://www.kinopoisk.ru/', headers = headers)
    g.go(url)
    return g.doc('//meta[@name="twitter:app:url:iphone"]/@content').text().split('/')[-1]
Example #7
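# Load a Kinopoisk movie page (from a cached copy or over HTTP) and extract the Russian title, original title, year, and genres.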
class KPMovie:
    def __init__(self, kid, cached = None):
        # Movie id.
        self.kid = kid
        self.g = Grab()
        self.cached = cached

        self.go()
    
    def go(self):
        if self.cached:
            print('Loading from cache', self.kid)
            # Re-parse the cached HTML instead of hitting the network.
            c = ('%s' % self.cached).encode('cp1251', 'ignore')
            self.g.doc.body = c
            self.g.doc.parse()
        else:
            print('Downloading', self.kid)
            self.g.go(self._url())
            self.cached = self.g.doc.unicode_body()

        self.name_ru = self.g.doc('//h1[contains(@itemprop, "name")]').text()
        self.name_en = self.g.doc('//span[contains(@itemprop, "alternativeHeadline")]').text()
        self.year = self.g.doc('//table[contains(@class, "info")]/tr/*/div/a').text()
        self.genres = [el.text() for el in self.g.doc('//span[contains(@itemprop, "genre")]/a')]
        
    def _url(self):
        return KINOPOISK_URL % self.kid
    
    @staticmethod
    def get_by_name(name):
        url = KINOPOISK_SEARCH % quote(name)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'
        }
        g = Grab(log_dir = 'tmp', referer = 'https://www.kinopoisk.ru/', headers = headers)
        g.go(url)
        return g.doc('//meta[@name="twitter:app:url:iphone"]/@content').text().split('/')[-1]
Example #8
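# Scrape the Hacker News front page and return a list of {'url', 'title'} dicts, resolving relative links against the site root.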
def ycombinator():
    spider = Grab()
    main_domain = 'https://news.ycombinator.com/'
    spider.go(main_domain)
    items = spider.doc('//td[@class="title"]/a[@class="storylink"]')

    links = items.attr_list('href')
    links = list(
        map(lambda url: url if 'https://' in url or 'http://' in url else "{}{}".format(main_domain, url), links)
    )
    title = items.text_list()

    result = [{'url': data[0], 'title': data[1]} for data in list(zip(links, title))]
    return result
Example #9
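# Log in to LinkedIn, page through the contact search results, scrape each profile (skills, companies, education, experience), and append the rows to contacts.xlsx on the desktop.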
def fetch_contacts(username, password):

    basicConfig(level=DEBUG)

    g = Grab()

    home_url = 'https://www.linkedin.com'
    g.go(home_url + '/uas/login')
    g.doc.set_input('session_key', username)
    g.doc.set_input('session_password', password)
    g.doc.submit()

    def get_nb_contacts():
        elem = g.doc('//li[@class="nav-item account-settings-tab"]/a')
        own_page = elem.attr('href')
        g.go(own_page)
        #g.doc.save('z.html')
        html = g.doc.select('//*[@id="top_card-content"]').html()
        start = html.find('{')
        com = html[start:-10]
        content = loads(com)["content"]
        res = content["ContactInfo"]["distance"]["numberOfConnections"]
        return res

    nb_contacts = 0
    while nb_contacts == 0:
        try:
            nb_contacts = get_nb_contacts()
        except:
            pass

    contacts_url = g.doc('//*[@id="advanced-search"]/@href').text()
    g.go(contacts_url)

    def process_comments(commented_line):
        start = commented_line.find('{')
        tmp = commented_line[start:-10]
        res = loads(tmp)["content"]["page"]["voltron_unified_search_json"]["search"]
        next_page_url = res["baseData"]["resultPagination"]["nextPage"]["pageURL"]
        results = res["results"]
        # Each result wraps a single contact dict; take its only value (Python 3).
        contacts = [next(iter(X.values())) for X in results]
        return contacts, home_url + next_page_url

    def process1contact(contact):
        s_dict = defaultdict(lambda: '')
        l_dict = defaultdict(lambda: [])

        s_dict["lastname"] = contact["lastName"]
        s_dict["firstname"] = contact["firstName"]
        try:
            s_dict["id"] = contact["id"]
        except:
            s_dict["id"] = ''
        try:
            s_dict["job_title"] = contact["fmt_headline"]
        except:
            s_dict["job_title"] = ''
        print(s_dict)

        links_profile = []
        for key in contact.keys():
            if 'link_nprofile_view' in key:
                links_profile.append(key)
        if len(links_profile) == 0:
            print("ERROR: No 'link_profile' provided in contact.keys()")
        link_profile = links_profile[0]
        url_contact = contact[link_profile]

        g.go(url_contact)

        # email
        try:
            s_dict["email"] = g.doc.select('//a[contains(@href,"mailto")]').text()
        except:
            s_dict["email"] = ''

        # phone number
        try:
            s_dict["phone"] = g.doc.select('//div[@id="phone-view"]/ul/li').text()
        except:
            s_dict["phone"] = ''

        # skills
        for elem in g.doc.select('//ul[@class="skills-section"]//span[contains(@class, "endorse-item-name-text")]'):
            l_dict["main_skills"].append(elem.text())
        for elem in g.doc.select('//ul[@class="skills-section compact-view"]//span[contains(@class, "endorse-item-name-text")]'):
            l_dict["other_skills"].append(elem.text())

        # companies
        tmp = g.doc.select('//div[@class="editable-item section-item current-position"]//a[contains(@href, "company-name")]')
        for elem in tmp:
            s_dict["current_company"] += elem.text()
        tmp = g.doc.select('//div[@class="editable-item section-item past-position"]//a[contains(@href, "company-name")]')
        for elem in tmp:
            if len(elem.text()) > 0:
                l_dict["former_companies"].append(elem.text())

        # summary
        try:
            s_dict["summary"] = g.doc.select('//div[@class="summary"]/p').text()
        except:
            s_dict["summary"] = ''

        # languages
        tmp = g.doc.select('//div[@id="languages"]//li[@class="section-item"]/h4/span')
        for elem in tmp:
            l_dict["languages"].append(elem.text())

        # projects
        project_names = g.doc('//div[@id="background-projects"]//span[@dir="auto"]')
        project_dates = g.doc('//div[@id="background-projects"]//span[@class="projects-date"]/time')
        for name, date in zip(project_names, project_dates):
            l_dict["projects"].append((name.text(), date.text()))

        # certifications
        certification_titles = g.doc('//div[@id="background-certifications"]//a[contains(@href,"certification_company_title")]')
        certification_orgs = g.doc('//div[@id="background-certifications"]//a[contains(@href,"certification-org_name")]')
        certification_dates = g.doc('//div[@id="background-certifications"]//span[@class="certification-date"]/time')
        for title, org, date in zip(certification_titles, certification_orgs, certification_dates):
            title_text = title.text()
            end = title_text.find('(')
            l_dict["certifications"].append((title_text[:end], org.text(), date.text()))

        # coursework
        schools = g.doc('//div[@id="background-education-container"]//a[contains(@href,"edu-school-name")]')
        dates = g.doc('//div[@id="background-education-container"]//span[@class="education-date"]')
        for school, date in zip(schools, dates):
            l_dict["coursework"].append((school.text(), date.text()))

        # graduation year
        try:
            end_schools = [ int(date.text().split(' ')[-1]) for date in dates ]
            s_dict["graduation_year"] = max(end_schools)
        except:
            s_dict["graduation_year"] = ''

        # experiences
        xp_titles = g.doc('//div[@id="background-experience"]//a[contains(@href,"profile_title")]')
        xp_comps = g.doc('//div[@id="background-experience"]//a[contains(@href,"company-name")]')
        xp_comps = filter(lambda x: len(x.text()) > 0, xp_comps)
        xp_dates = g.doc('//div[@id="background-experience"]//span[@class="experience-date-locale"]/time')
        xp_places = g.doc('//div[@id="background-experience"]//span[@class="locality"]')
        xp_summaries = g.doc('//div[@id="background-experience"]//p[@class="description summary-field-show-more"]')
        for title, company, date, place, summary in zip(xp_titles, xp_comps, xp_dates, xp_places, xp_summaries):
            l_dict["experiences"].append((title.text(), company.text(), date.text(), place.text(), summary.text()))

        def pretty_tuple(T):
            # Flatten a tuple of strings into one comma-separated string (Python 3).
            if isinstance(T, tuple):
                return ', '.join(T)
            return T

        def pretty_list(L):
            return '\n'.join(pretty_tuple(l) for l in L)

        s_l_dict = {k:pretty_list(v) for k, v in l_dict.items()}
        s_dict.update(s_l_dict)
        return s_dict

    import os
    path_to_desktop = os.path.expanduser('~/Desktop/')
    filename = path_to_desktop + 'contacts.xlsx'
    if os.path.isfile(filename):
        saved_df = read_excel(filename)
        saved_IDs = saved_df["id"].values
    else:
        saved_IDs = []

    contacts = []
    nb_pages = int((nb_contacts-1)/10 + 1)
    #nb_pages = 2
    for i in range(nb_pages):
        comments = g.doc.select('//*[@id="voltron_srp_main-content"]').html()
        new_contacts, next_page_url = process_comments(comments)
        if i == nb_pages - 1:
            new_contacts = filter(lambda contact: contact["distance"] == 1, new_contacts)
        new_contacts = filter(lambda contact: contact["id"] not in saved_IDs, new_contacts)
        contacts.extend(new_contacts)
        g.go(next_page_url)

    processed_contacts = [ process1contact(contact) for contact in contacts ]

    df = DataFrame(processed_contacts)
    cols_sorted = ["lastname", "firstname", "email", "phone", "job_title", "main_skills",
            "other_skills", "current_company", "former_companies", "certifications",
            "projects", "graduation_year", "coursework", "summary", "experiences",
            "languages", "id"]
    df_cols = df.columns.tolist()
    cols = [col for col in cols_sorted if col in df_cols]
    df = df[cols]

    if os.path.isfile(filename):
        df = saved_df.append(df)

    df.to_excel(filename, sheet_name='sheet1', index=False)

    return processed_contacts
Example #10
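# Module-level variant of the Hacker News scraper: collect (link, title) pairs from the front page.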
from grab import Grab

spider = Grab()
main_domain = 'https://news.ycombinator.com/'
spider.go(main_domain)
items = spider.doc('//td[@class="title"]/a[@class="storylink"]')

links = items.attr_list('href')
links = list(
    map(
        lambda url: url
        if 'https://' in url or 'http://' in url else "{}{}".format(
            main_domain, url), links))
title = items.text_list()

result = list(zip(links, title))
Example #11
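# Check whether a YouTube channel page exposes any video links, sending a Googlebot User-Agent.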
import logging

from grab import Grab

logging.basicConfig(level=logging.DEBUG)

g = Grab()
g.setup(
    headers={
        'User-Agent':
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    })

g.go('https://www.youtube.com/channel/UCy3eOZh9n5P3aWWKXmrq6yg')

# g.doc.save('x.html')

result = g.doc('//a[@id="video-title"]').exists()

print(result)
Example #12
# crawl github project list
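# Log in through the GitHub login form, then print each repository of the signed-in user with its absolute URL.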

from grab import Grab
import logging

logging.basicConfig(level=logging.DEBUG)
g = Grab()
g.go('https://github.com/login')
print(g.doc.form)
g.doc.set_input('login', '*****@*****.**')
g.doc.set_input('password', '')
g.doc.submit()
g.doc.save('/tmp/x.html')


home_url = g.doc('//a[contains(@class, "header-nav-link name")]/@href').text()
repo_url = home_url + '?tab=repositories'

g.go(repo_url)
for elem in g.doc.select('//h3[@class="repo-list-name"]/a'):
    print('%s: %s' % (elem.text(),
                      g.make_url_absolute(elem.attr('href'))))




# from grab.spider import Spider, Task
# import logging
#
# class ExampleSpider(Spider):
#     def task_generator(self):