class CreateMedicalCorpus(object):
    def __init__(self):
        # TODO: read the collection names from config file
        self.input_collection = 'disease_categories'
        self.output_collection = 'medical_corpus'
        self.mongo_utilities = MongoUtilities()
        self.crawler = Crawler()

    def get_all_urls(self):
        """
        Collects all the URLs for each disease category from the
        disease_categories collection. The column 'crawled' is set to 'No'
        for all the documents.
        :return: returns nothing, inserts documents into medical_corpus
        """
        docs = self.mongo_utilities.get_all_documents(self.input_collection)
        for doc in docs:
            print(doc['category'])
            soup = self.crawler.get_html_content(doc['url'])
            urls = self.crawler.get_sub_category_urls(soup, doc['category'])
            self.mongo_utilities.insert_documents(self.output_collection, urls)

    def get_content(self):
        """
        Crawls the actual content for each disease in the medical corpus.
        For each crawled document the column 'crawled' is set to 'Yes'.
        :return: returns nothing, modifies documents in medical_corpus
        """
        docs = self.mongo_utilities.get_uncrawled_docs()
        count = 0
        for doc in docs:
            count += 1
            print(count)
            url = doc['url']
            text = ""
            soup = self.crawler.get_html_content(url)
            # Wrap the text of every paragraph tag in start/end markers
            for p_tag in soup.find_all('p'):
                text += '<para start>' + p_tag.text + '<para end>'
            self.mongo_utilities.update_document(
                self.output_collection,
                {'$set': {'content': text, 'crawled': 'Yes'}},
                doc['_id'])
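# A minimal sketch of how this class might be driven end to end, assuming
# GetIcdUrls (see below) has already populated disease_categories and that
# Crawler / MongoUtilities are importable from the project. The two-step
# ordering shown here is an assumption, not part of the original code.
if __name__ == '__main__':
    corpus_builder = CreateMedicalCorpus()
    # First collect the per-disease URLs from every category page,
    # then crawl the page content for each uncrawled document.
    corpus_builder.get_all_urls()
    corpus_builder.get_content()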
def crawl(listen, verbose, outputfile, outputtype):
    """Configure a Crawler from the given flags and run it."""
    c = Crawler.Crawler(urls=config.URLS)
    if listen:
        c.listener = True
    if verbose:
        c.verbose = True
    c.output_file = outputfile
    c.output_type = outputtype
    c.run()
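# A hedged sketch of how crawl() could be wired to a command line using
# argparse from the standard library. The flag names simply mirror the
# function parameters; they are assumptions, not the project's actual CLI.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Run the crawler.')
    parser.add_argument('--listen', action='store_true', help='enable listener mode')
    parser.add_argument('--verbose', action='store_true', help='enable verbose output')
    parser.add_argument('--outputfile', default='output.json', help='file to write results to')
    parser.add_argument('--outputtype', default='json', help='output format')
    args = parser.parse_args()
    crawl(args.listen, args.verbose, args.outputfile, args.outputtype)

if __name__ == '__main__':
    main()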
class GetIcdUrls(object):
    def __init__(self):
        self.output_collection = 'disease_categories'
        self.crawler = Crawler()
        self.mongo = MongoUtilities()

    def run(self):
        """
        Collects the URLs of all the categories listed on this page:
        https://en.wikipedia.org/wiki/ICD-10
        :return: returns nothing. Inserts documents into disease_categories
        """
        # Get HTML content
        soup = self.crawler.get_html_content(self.crawler.icd_url)
        # Extract categories and their URLs
        categories = self.crawler.get_icd_category_urls(soup)
        # Write to MongoDB
        self.mongo.insert_documents(self.output_collection, categories)
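# A minimal usage sketch, assuming this class is run as a one-off step to
# seed the disease_categories collection before CreateMedicalCorpus runs.
if __name__ == '__main__':
    GetIcdUrls().run()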
import configparser
import os
import sys

import textract

if __name__ == '__main__':
    config = configparser.ConfigParser()
    config.read('conf.ini')
    downloads_path = config.get('general', 'downloads_path', fallback='/tmp/downloads/')
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
    elif not os.path.isdir(downloads_path):
        print('ERROR: downloads_path parameter points to a file!')
        sys.exit(1)

    crawler = Crawler(config, 'captions')
    crawler.get(config.get('captions', 'url'))
    captions_list = []
    for row in crawler.get_elements('tbody tr'):
        items = crawler.get_elements('td', root=row)
        filename = items[0].text
        print("Current filename is {}".format(filename))
        url = crawler.get_attr('a', 'href', root=items[0])
        crawler.download(url, filename)
        # Convert the downloaded PDF to plain text next to it
        convert_filename = filename.replace('.pdf', '.txt')
        os.system("pdftotext '%s' '%s'" % (downloads_path + 'Captions/' + filename,
                                           downloads_path + 'Captions/' + convert_filename))
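# The os.system() call above breaks if a filename contains a single quote.
# Below is a sketch of a safer alternative using subprocess.run with an
# argument list, assuming the pdftotext binary is on PATH; this is a
# suggested variant, not the original script's behavior.
import subprocess

def pdf_to_text(pdf_path, txt_path):
    # Passing a list avoids shell interpolation of the filenames entirely.
    subprocess.run(['pdftotext', pdf_path, txt_path], check=True)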