Пример #1
0
class CreateMedicalCorpus(object):
    """Builds the medical corpus by crawling every disease category.

    Reads category URLs from the ``disease_categories`` collection and
    writes the crawled documents into ``medical_corpus``.
    """

    def __init__(self):
        # TODO: read the collection names from config file
        self.input_collection = 'disease_categories'
        self.output_collection = 'medical_corpus'
        self.mongo_utilities = MongoUtilities()
        self.crawler = Crawler()

    def get_all_urls(self):
        """
        Collects all the URLs from each disease category from disease_categories collection.
        The column 'crawled' is set to 'No' for all the documents.
        :return: returns nothing, inserts documents into medical_corpus
        """

        docs = self.mongo_utilities.get_all_documents(self.input_collection)
        for doc in docs:
            # print() works on both Python 2 and 3; the original bare
            # print statement was Python-2 only and inconsistent with
            # the rest of the project.
            print(doc['category'])
            soup = self.crawler.get_html_content(doc["url"])
            urls = self.crawler.get_sub_category_urls(soup, doc['category'])
            self.mongo_utilities.insert_documents(self.output_collection, urls)

    def get_content(self):
        """
        Crawls the actual content for each disease in medical corpus.
        For each crawled document the column 'crawled' is set to 'Yes'
        :return: returns nothing, modifies documents in medical_corpus
        """

        docs = self.mongo_utilities.get_uncrawled_docs()
        for count, doc in enumerate(docs, start=1):
            # Progress indicator: one line per crawled document.
            print(count)
            soup = self.crawler.get_html_content(doc['url'])
            # Concatenate every paragraph tag; join() avoids the quadratic
            # cost of repeated string += inside a loop.
            text = ''.join('<para start>' + p_tag.text + '<para end>'
                           for p_tag in soup.findAll('p'))

            self.mongo_utilities.update_document(self.output_collection, {
                '$set': {
                    'content': text,
                    'crawled': 'Yes'
                }
            }, doc['_id'])
Пример #2
0
def crawl(listen, verbose, outputfile, outputtype):
    """Build a Crawler over the configured URLs, apply the CLI options
    and run it.

    The ``listen`` and ``verbose`` flags are only ever switched on here;
    when falsy, the crawler keeps its own defaults.
    """
    crawler = Crawler.Crawler(urls=config.URLS)
    if listen:
        crawler.listener = True
    if verbose:
        crawler.verbose = True
    crawler.output_file = outputfile
    crawler.output_type = outputtype
    crawler.run()
Пример #3
0
class GetIcdUrls(object):
    """Scrapes the ICD-10 category links and stores them in Mongo."""

    def __init__(self):
        # Helpers for fetching pages and talking to the database.
        self.crawler = Crawler()
        self.mongo = MongoUtilities()
        # Target collection for the scraped category documents.
        self.output_collection = 'disease_categories'

    def run(self):
        """
        Collects the urls of all the categories listed on this page:
        https://en.wikipedia.org/wiki/ICD-10
        :return: returns nothing. Inserts documents into disease_categories
        """
        # Fetch and parse the ICD index page.
        page = self.crawler.get_html_content(self.crawler.icd_url)
        # Pull out (category, url) documents from the parsed page.
        category_docs = self.crawler.get_icd_category_urls(page)
        # Persist them for the corpus-building step.
        self.mongo.insert_documents(self.output_collection, category_docs)
Пример #4
0
 def __init__(self):
     """Set up the crawler, the Mongo helper and the target collection."""
     self.crawler = Crawler()
     self.mongo = MongoUtilities()
     self.output_collection = 'disease_categories'
Пример #5
0
import textract

if __name__ == '__main__':
    # Local import: needed for the safe pdftotext invocation below.
    import subprocess

    config = configparser.ConfigParser()
    config.read('conf.ini')

    # Where downloaded PDFs and converted text files are stored.
    downloads_path = config.get('general',
                                'downloads_path',
                                fallback='/tmp/downloads/')
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
    elif not os.path.isdir(downloads_path):
        print('ERROR: downloads_path parameter points to file!')
        sys.exit(1)

    crawler = Crawler(config, 'captions')
    crawler.get(config.get('captions', 'url'))

    for row in crawler.get_elements('tbody tr'):
        items = crawler.get_elements('td', root=row)
        filename = items[0].text
        print("Current Filename is {}".format(filename))
        url = crawler.get_attr('a', 'href', root=items[0])
        crawler.download(url, filename)

        # Derive the .txt name from the extension only; str.replace could
        # also rewrite an embedded '.pdf' in the middle of the name.
        convert_filename = os.path.splitext(filename)[0] + '.txt'
        # Argument-list subprocess call: the filename comes from a scraped
        # remote page, so building a shell string (os.system) was open to
        # shell injection via crafted filenames.
        subprocess.run(['pdftotext',
                        os.path.join(downloads_path, 'Captions', filename),
                        os.path.join(downloads_path, 'Captions',
                                     convert_filename)])
Пример #6
0
 def __init__(self):
     """Initialise the helpers and the source/target collection names."""
     # TODO: read the collection names from config file
     self.crawler = Crawler()
     self.mongo_utilities = MongoUtilities()
     self.input_collection = 'disease_categories'
     self.output_collection = 'medical_corpus'