Example No. 1
def parse_category(url, category):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    all_jobs = soup.findAll('tr')
    for job in all_jobs:
        link = job.find('a')
        if link is None:  # skip table rows without a job link
            continue
        url = 'https://freelancehunt.com' + link.attrs['href']
        title = link.text
        text = link.attrs['title']
        date = job.attrs['data-published']
        try:
            price = job.find('div', class_='price').text.split('\n')[1]
        except AttributeError:
            price = None
        print('\nDate:', date,
              '\nTitle:', title,
              '\nText:', text,
              '\nPrice:', price,
              '\nURL:', url, '\n\n')
        if not job_exist(url):
            job_row = Job(title=title,
                          date=date,
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now())
            session.add(job_row)

    session.commit()
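All of the parse_category examples on this page rely on the same scaffolding that the snippets never show: a SQLAlchemy Job model, an open session, and a job_exist helper that checks whether a URL is already stored. A minimal sketch of what that might look like (an assumption; the real projects may define these differently):

from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Job(Base):
    __tablename__ = 'jobs'  # hypothetical table name
    id = Column(Integer, primary_key=True)
    title = Column(String)
    date = Column(String)  # some examples store raw timestamps, others text
    price = Column(String)
    url = Column(String, unique=True)
    category = Column(String)
    parse_date = Column(DateTime)
    description = Column(Text)


engine = create_engine('sqlite:///jobs.db')  # hypothetical backing store
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()


def job_exist(url):
    # True if a job with this URL has already been saved.
    return session.query(Job).filter_by(url=url).first() is not None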
Example No. 2
def parse_category(url, category):
    page = urllib2.urlopen(url, context=ctx)
    soup = BeautifulSoup(page, 'html.parser')
    all_jobs = soup.findAll('div', {'class': 'jobsearch-result-list'})

    for job in all_jobs:
        title = job.find('a', {'style': 'color: #000;'}).text
        print title
        date_raw = job.find('div',
                            {'class': 'col-xs-6 col-md-2 col-lg-2 lefttop'})
        date = date_raw.find('b').text.split()[0]
        print date
        price = job.find('div', {
            'class': 'col-xs-6 col-md-2 col-lg-2 leftbottom'
        }).text.split()[2]
        print price
        url = 'http://www.freelance.com' + job.find('a', {
            'style': 'color: #000;'
        }).get('href')
        print url

        if not job_exist(url):
            job_row = Job(title=unicode(title),
                          date=unicode(date),
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now())
            session.add(job_row)

    session.commit()
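The ctx argument passed to urllib2.urlopen above is never defined in the snippet; it is presumably an SSL context that skips certificate verification, a common shortcut in quick scrapers. A sketch of the assumed setup (works on Python 2.7.9+):

import ssl

# Unverified HTTPS context: convenient for scraping, unsafe for anything else.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE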
Example No. 3
def parse_category(url, category):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    all_jobs = soup.findAll('article', {"class": "task task_list"})

    for job in all_jobs:

        # print job
        title = job.find("div", "task__title").text
        print "Title:\t", title
        
        url = 'http://freelansim.ru' + job.find("div", "task__title").find("a").get('href')
        print "Url:\t", url
        
        if not job_exist(url):
         
            date = job.find("span", "params__published-at").text.splitlines()
            date = str(date[0])
            print "Date:\t", date
            
            price_raw = job.find("div", "task__price")
            price = price_raw.find("span", "count")
            if price:
                price = price.text
            else:  # no fixed price; fall back to the "negotiable" label
                price = price_raw.find("span", "negotiated_price").text

            print "Price:\t", price
            
            text_page = urllib2.urlopen(url)
            text_soup = BeautifulSoup(text_page)
            text = text_soup.find('div', {'class': 'task__description'}).text
            
            text_length = 320
            text = (text[:text_length] + '..') if len(text) > text_length else text

            print text, "\n"

            job_row = Job(title=unicode(title),
                          date=unicode(date),
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now(),
                          description=text)
            session.add(job_row)
            session.commit()
Example No. 4
def parse_category(url, category):
    page = urllib2.urlopen(url, context=ctx)
    soup = BeautifulSoup(page, 'html.parser')
    all_jobs = soup.findAll('div', {'class': 'jobsearch-result-list'})

    for job in all_jobs:
        title = job.find('a', {'style': 'color: #000;'}).text
        print title
        url = 'http://www.freelance.com' + job.find('a', {
            'style': 'color: #000;'
        }).get('href')
        print url

        if not job_exist(url):
            date_raw = job.find(
                'div', {'class': 'col-xs-6 col-md-2 col-lg-2 lefttop'})
            date = date_raw.find('b').text.split()[0]
            print date
            price = job.find('div', {
                'class': 'col-xs-6 col-md-2 col-lg-2 leftbottom'
            }).text.split()[2]
            print price
            text_page = urllib2.urlopen(url, context=ctx)
            text_soup = BeautifulSoup(text_page)
            text = text_soup.find('div', {
                'class': 'col-md-9 col-lg-9 description'
            }).text[11:]

            text_length = 320
            text = (text[:text_length] +
                    '..') if len(text) > text_length else text
            print text

            print '=========\n\n'

            job_row = Job(title=unicode(title),
                          date=unicode(date),
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now(),
                          description=text)
            session.add(job_row)
            session.commit()
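Examples 2 through 4 are Python 2 code (urllib2, print statements, unicode). For comparison with the later examples, the fetch step ports to Python 3 roughly like this, with verify=False standing in for the unverified SSL context:

import requests
from bs4 import BeautifulSoup


def fetch(url):
    # requests replaces urllib2; verify=False mirrors the unverified
    # context used above (a scraper shortcut, not a recommendation).
    page = requests.get(url, verify=False).content
    return BeautifulSoup(page, 'html.parser')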
Example No. 5
def parse_category(url, category):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    all_jobs = soup.findAll('div', {'class': 'row'})
    for job in all_jobs:
        title_raw = job.find('div', class_='col-sm-10')
        try:  # to get Title
            title = title_raw.find('h2').text
            url = 'https://www.weblancer.net' + title_raw.find('a').attrs['href']
            if not job_exist(url):
                text = title_raw.find('p').text
                try:  # to get date
                    date = job.find('span', class_='time_ago').attrs['data-timestamp']
                except AttributeError:
                    date = None

                print(date)
                try:  # to get price
                    price = job.find('div', class_='amount').text
                except AttributeError:
                    price = None

                print('\nDate:', date,
                      '\nTitle:', title,
                      '\nText:', text,
                      '\nPrice:', price,
                      '\nURL:', url)
                job_row = Job(
                    title=title,
                    date=date,
                    price=price,
                    url=url,
                    category=category,
                    parse_date=datetime.now(),
                    description=text
                )
                session.add(job_row)
                session.commit()

            else:
                print(title)

        except AttributeError:  # row without a job posting; skip it
            pass
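The nested try/except blocks above guard against rows where a tag or attribute is missing. Since a Tag's attrs is a plain dict, an equivalent and flatter pattern is to test the find result and use attrs.get, e.g. for the timestamp:

span = job.find('span', class_='time_ago')
date = span.attrs.get('data-timestamp') if span is not None else None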
Example No. 6
def parse_category(url, category):
    page = requests.get(url).content
    soup = BeautifulSoup(page, "html.parser")
    all_jobs = soup.findAll('article', {"class": "task task_list"})

    for job in all_jobs:
        title = job.find("div", "task__title").text
        url = 'http://freelance.habr.com' + job.find(
            "div", "task__title").find("a").get('href')

        if not job_exist(url):
            date = job.find("span", "params__published-at").text.splitlines()
            date = str(date[0])

            price_raw = job.find("div", "task__price")

            try:
                price = price_raw.find("span", "count").text
            except AttributeError:  # no fixed price; fall back to the "negotiable" label
                price = price_raw.find("span", "negotiated_price").text

            text_page = requests.get(url).content
            text_soup = BeautifulSoup(text_page, "html.parser")
            text = text_soup.find('div', {'class': 'task__description'}).text

            text_length = 320
            text = (text[:text_length] +
                    '..') if len(text) > text_length else text

            job_row = Job(title=title,
                          date=date,
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now(),
                          description=text)

            session.add(job_row)
            session.commit()
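The slice-and-append truncation used here (and in Examples 3 and 4) can also be done with the standard library. textwrap.shorten additionally collapses runs of whitespace and cuts on word boundaries, so the result is close but not byte-identical:

import textwrap

text = textwrap.shorten(text, width=320, placeholder='..')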
Example No. 7
def parse_category(url, category):
    page = requests.get(url).content
    soup = BeautifulSoup(page, "html.parser")

    all_jobs = soup.find_all('tr')

    for job in all_jobs:
        a = job.find('a')
        if a is None:  # skip table rows without a job link
            continue
        title = a.text.strip()
        url = 'https://freelancehunt.com' + a.attrs['href']

        if not job_exist(url):
            text = job.find('p', {"style": "word-break: break-word"}).text.strip()
            date = int(job.attrs['data-published'])

            try:
                price = job.find('div', class_='text-green price with-tooltip').text.strip()
            except AttributeError:
                price = None

            job_row = Job(
                title=title,
                date=date,
                price=price,
                url=url,
                category=category,
                parse_date=datetime.now(),
                description=text
            )

            session.add(job_row)
            session.commit()
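This example stores data-published as a raw integer. If, as the attribute name suggests, it is a Unix timestamp (an assumption; the snippet never decodes it), it converts to a datetime like so:

from datetime import datetime

# Assumes 'data-published' holds seconds since the epoch.
published = datetime.fromtimestamp(int(job.attrs['data-published']))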
Example No. 8
def parse_category(url, category):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    all_jobs = soup.findAll('article', {"class": "task task_list"})

    for job in all_jobs:

        title = job.find("div", "task__title").text
        print "Title:\t", title
        url = 'http://freelansim.ru' + job.find("div", "task__title").find("a").get('href')
        print "Url:\t", url
        date = job.find("span", "params__published-at").text.splitlines()
        date = str(date[0]+' '+date[1])
        print "Date:\t", date
        price_raw = job.find("div", "task__price")
        price = price_raw.find("span", "count")
        if price:
            price = price.text
        else:  # no fixed price; fall back to the "negotiable" label
            price = price_raw.find("span", "negotiated_price").text

        print "Price:\t", price, "\n"

        if not job_exist(url):
            job_row = Job(title=unicode(title),
                          date=unicode(date),
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now())
            session.add(job_row)

    session.commit()
Example No. 9
def parse_category(url, category):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')

    all_jobs = soup.find_all('div', class_='row click_container-link set_href')

    for job in all_jobs:
        right_div = job.find('div', class_='col-sm-4 text-sm-right')
        right = right_div.find('span') if right_div else None

        try:
            if right.text.startswith('Закрыт'):  # "Closed"; skip finished jobs
                continue
        except AttributeError:  # no status span on this row
            pass

        a = job.find('div', class_='title').find('a')

        title = a.text.strip()
        url = 'https://www.weblancer.net' + a.attrs['href']

        if not job_exist(url):
            try:
                date = int(
                    right.find('span',
                               class_='time_ago').attrs['data-timestamp'])
            except (AttributeError, KeyError, ValueError):
                date = ''
            try:
                text = " ".join(
                    job.find('div',
                             class_='collapse').text.strip().split(" ")[:-1])
            except AttributeError:
                text = job.find('div',
                                class_='text_field text-inline').text.strip()

            text = text.replace("\n", " ")

            try:
                price = job.find(
                    'div',
                    class_='float-right float-sm-none title amount indent-xs-b0'
                ).find('span').text.strip()
            except AttributeError:
                price = None

            job_row = Job(title=title,
                          date=date,
                          price=price,
                          url=url,
                          category=category,
                          parse_date=datetime.now(),
                          description=text)

            session.add(job_row)
            session.commit()