def test_gene_pubs(self):
        ''' Check the difference between the pubs indexed and those from the gene_pub file
        from the NCBI. If the publication pipeline has not been run recently there is likely
        to be a difference. This is allowed for with the NUM_DIFF variable. If there is a
        larger difference than this then the publication pipeline should be run.

        The gene publication file is downloaded into the test data directory and is always
        removed again, even when one of the assertions fails. '''
        ini = IniParser()
        config = ini.read_ini('publications.ini')
        section = config['GENE']

        file_name = 'gene_pub_test.tmp'
        download_file = os.path.join(DiseasePublicationTest.TEST_DATA_DIR, file_name)
        try:
            success = FTPDownload().download(urljoin(section['location'], section['files']),
                                             DiseasePublicationTest.TEST_DATA_DIR, file_name=file_name)
            self.assertTrue(success, 'downloaded gene publications file')

            # Distinct third-column values (the PMIDs) of the rows whose first column is
            # 9606 (presumably the human taxonomy id - confirm against the file format).
            with gzip.open(download_file, 'rt') as gene_pub_file:
                pmids = list({line.split('\t')[2].strip() for line in gene_pub_file
                              if line.startswith('9606\t')})

            elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                                       sources=['pmid']),
                             idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
            # NOTE: only the counts are compared; the individual PMIDs missing from the
            # index are not listed (that would need a scan over the PUBLICATION index).
            self.assertLess(len(pmids)-elastic.get_count()['count'], GenePublicationTest.NUM_DIFF,
                            'Count for gene publications')
        finally:
            # clean up the download whether or not the checks above passed
            if os.path.exists(download_file):
                os.remove(download_file)
    def setUpClass(cls):
        ''' Download the NCBI publication list of every disease found in the DISEASE index.

        Each disease code is turned into a 'DISEASE::<CODE>' section name; the matching
        sections of publications.ini give the location and parameters of the download.
        Every downloaded disease is recorded in DiseasePublicationTest.DISEASES. '''
        parser = IniParser()
        config = parser.read_ini('publications.ini')
        disease_query = ElasticQuery(Query.match_all(), sources=['code'])
        res = Search(disease_query, idx=ElasticSettings.idx('DISEASE')).search()

        # comma-terminated list of section names, e.g. 'DISEASE::T1D,DISEASE::MS,'
        sections = ''.join('DISEASE::' + getattr(doc, 'code').upper() + ',' for doc in res.docs)

        # fetch the ncbi publication list of each selected section
        for section_name in config.sections():
            selected = sections is None or parser._is_section_match(section_name, sections)
            if selected:
                parser._inherit_section(section_name, config)
                logger.debug(section_name)
                section = config[section_name]
                disease = section_name.split('::')[1]
                url = section['location'] + "?" + section['http_params']
                HTTPDownload().download(url, cls.TEST_DATA_DIR,
                                        file_name='disease_pub_{}.tmp'.format(disease))
                DiseasePublicationTest.DISEASES.append(disease)
        print()
Exemplo n.º 3
0
    def get_criteria_config(cls, ini_file='criteria.ini'):
        ''' Build the criteria config from an ini file.

        The file is looked up relative to the parent of the directory containing this
        module; names containing 'test' are looked up in its 'test' sub-directory.
        Returns the parsed config, or None when the file does not exist.
        '''
        package_root = os.path.dirname(os.path.dirname(__file__))
        path_parts = [package_root, 'test', ini_file] if 'test' in ini_file else [package_root, ini_file]
        ini_path = os.path.join(*path_parts)

        if not os.path.isfile(ini_path):
            return None
        return IniParser.read_ini(cls, ini_file=ini_path)
Exemplo n.º 4
0
    def tag_feature_to_all_diseases(cls, feature_id, section, config, result_container=None):
        ''' function to tag the feature to all the site enabled diseases, used to tag features
        in the MHC region
        @type  feature_id: string
        @keyword feature_id: Id of the feature (gene => gene_id, region=>region_id)
        @type  section: string
        @keyword section: The section in the criteria.ini file (kept for interface
                          compatibility; not read by this method)
        @type  config: object
        @keyword config: The config object initialized from criteria.ini (kept for interface
                         compatibility; not read by this method)
        @type  result_container: dict
        @keyword result_container: Container object for storing the result with keys as the
                                   feature_id. It is updated in place; a new dict is created
                                   when it is omitted or None.
        @return: the result container, with result_container[feature_id] holding the value
                 returned by get_criteria_disease_dict after every disease has been added
        '''
        # A fresh dict per call: a mutable default would be shared by every call that omits
        # the argument. An explicitly supplied (even empty) dict is still updated in place.
        if result_container is None:
            result_container = {}

        for disease in cls.site_enabled_diseases:
            criteria_dict = cls.get_criteria_dict(disease, disease)
            # extend what is already stored for this feature, or start from an empty dict
            criteria_disease_dict = result_container.get(feature_id) or {}
            result_container[feature_id] = cls.get_criteria_disease_dict([disease], criteria_dict,
                                                                         criteria_disease_dict)

        return result_container