Example #1
class GetIcdUrls(object):

    def __init__(self):
        self.output_collection = 'disease_categories'
        self.crawler = Crawler()
        self.mongo = MongoUtilities()

    def run(self):
        """
        Collects the urls of all the categories listed on this page:
        https://en.wikipedia.org/wiki/ICD-10
        :return: returns nothing. Inserts documents into disease_categories
        """
        # Get html content
        soup = self.crawler.get_html_content(self.crawler.icd_url)
        # Extract category and urls
        categories = self.crawler.get_icd_category_urls(soup)
        # Write to Mongo DB
        self.mongo.insert_documents(self.output_collection, categories)
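
The Crawler helper used above is not shown on this page. A minimal sketch of the two methods Example #1 calls, assuming requests and BeautifulSoup, might look like the following; the 'wikitable' selector and the URL-joining logic are assumptions, not the original implementation.

import requests
from bs4 import BeautifulSoup


class Crawler(object):

    # Entry page whose category links are collected
    icd_url = 'https://en.wikipedia.org/wiki/ICD-10'

    def get_html_content(self, url):
        # Fetch the page and parse it into a BeautifulSoup tree
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def get_icd_category_urls(self, soup):
        # Build one document per category link; selecting the first
        # 'wikitable' is a guess at the page's chapter table.
        categories = []
        table = soup.find('table', class_='wikitable')
        if table is None:
            return categories
        for a_tag in table.find_all('a', href=True):
            categories.append({
                'category': a_tag.text.strip(),
                'url': 'https://en.wikipedia.org' + a_tag['href'],
            })
        return categories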
Example #2
class CreateMedicalCorpus(object):

    def __init__(self):
        # TODO: read the collection names from config file
        self.input_collection = 'disease_categories'
        self.output_collection = 'medical_corpus'
        self.mongo_utilities = MongoUtilities()
        self.crawler = Crawler()

    def get_all_urls(self):
        """
        Collects all the URLs from each disease category from disease_categories collection.
        The column 'crawled' is set to 'No' for all the documents.
        :return: returns nothing, inserts documents into medical_corpus
        """

        docs = self.mongo_utilities.get_all_documents(self.input_collection)
        for doc in docs:
            print(doc['category'])
            soup = self.crawler.get_html_content(doc['url'])
            urls = self.crawler.get_sub_category_urls(soup, doc['category'])
            self.mongo_utilities.insert_documents(self.output_collection, urls)

    def get_content(self):
        """
        Crawls the actual content for each disease in medical corpus.
        For each crawled document the column 'crawled' is set to 'Yes'
        :return: returns nothing, modifies documents in medical_corpus
        """

        docs = self.mongo_utilities.get_uncrawled_docs()
        count = 0
        for doc in docs:
            count += 1
            print(count)  # simple progress counter
            url = doc['url']
            text = ""
            soup = self.crawler.get_html_content(url)
            # Concatenate the text of every paragraph, wrapped in markers
            for p_tag in soup.find_all('p'):
                text += '<para start>' + p_tag.text + '<para end>'

            self.mongo_utilities.update_document(self.output_collection, {
                '$set': {
                    'content': text,
                    'crawled': 'Yes'
                }
            }, doc['_id'])
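
MongoUtilities is likewise project-local and not shown. A minimal sketch of the four methods these examples call, assuming pymongo with placeholder connection and database names, could be:

from pymongo import MongoClient


class MongoUtilities(object):

    def __init__(self):
        # Placeholder URI and database name; the originals are not shown
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client['medical_db']

    def insert_documents(self, collection, documents):
        # Bulk-insert a list of dicts into the named collection
        if documents:
            self.db[collection].insert_many(documents)

    def get_all_documents(self, collection):
        return self.db[collection].find()

    def get_uncrawled_docs(self):
        # Documents whose content has not been fetched yet; hard-coding
        # the medical_corpus collection mirrors the examples above.
        return self.db['medical_corpus'].find({'crawled': 'No'})

    def update_document(self, collection, update, doc_id):
        self.db[collection].update_one({'_id': doc_id}, update)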
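
Taken together, the two classes form a small two-stage pipeline. A usage sketch, assuming the classes above are importable, would be:

# Usage sketch: stage 1 collects the category URLs, stage 2 expands
# them into per-disease pages and fetches their content.
GetIcdUrls().run()

corpus = CreateMedicalCorpus()
corpus.get_all_urls()
corpus.get_content()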