Пример #1
0
    def _POST_query(self, qs, scopes):
        """Build the newline-delimited request body for a batch of terms.

        Every term contributes two lines: an empty header (``{}``) followed
        by the JSON-encoded result of ``self._POST_single_query``. (The
        header/body pairing looks like the Elasticsearch multi-search
        layout -- confirm against the caller.)

        Integer-like terms are searched only in the integer fields among
        *scopes*; all other terms only in the remaining fields. A term with
        no applicable field is replaced by ``_POST_single_query(term=None)``
        so the output keeps one entry per input term.

        :param qs: iterable of query terms.
        :param scopes: fields to search; ``self.default_scopes`` is used
            when this is empty or ``None``.
        :return: the value of ``self._return_query_kwargs`` for
            ``{'body': <joined lines>}``.
        """
        INT_FIELDS = {'entrezgene', 'retired'}

        if not scopes:
            scopes = self.default_scopes

        # The split of scopes does not depend on the term, so compute it
        # once instead of rebuilding the sets on every iteration.
        scope_set = set(scopes)
        int_scopes = scope_set.intersection(INT_FIELDS)
        other_scopes = scope_set.difference(INT_FIELDS)

        _q = []
        for term in qs:
            applicable = int_scopes if is_int(term) else other_scopes
            if applicable:
                # Hand over a fresh list each time so the callee can never
                # affect the scopes seen by later terms.
                single = self._POST_single_query(term, scopes=list(applicable))
            else:
                single = self._POST_single_query(term=None)
            _q.extend(['{}', json.dumps(single)])
        return self._return_query_kwargs({'body': '\n'.join(_q)})
Пример #2
0
    def select_species(self):
        """Download and parse ``dataset_names.txt.gz`` for ``self.release``.

        The file is fetched from the FTP host named by the class attribute
        ``ENSEMBL_FTP_HOST`` into a temporary file, which is always removed
        before returning.

        :return: list of three-element lists built from columns 0, 5 and 4
            of the file, in that order (presumably species name, common
            name and taxid -- confirm against the file layout). The last
            element is an ``int`` when the column is integer-like,
            otherwise ``None``.
        """
        import pprint
        import tempfile

        # mkstemp creates the file atomically; mktemp() only returned a name
        # and left a window in which another process could claim it.
        fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
        try:
            self.logger.info('Downloading "dataset_names.txt.gz"...')
            species_file = '/pub/metazoa/release-%s/mysql/metazoa_mart_%s/dataset_names.txt.gz' % (
                self.release, self.release)
            # The with-block closes the file even if the transfer fails.
            with os.fdopen(fd, 'wb') as out_f:
                ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
                try:
                    ftp.login()
                    ftp.retrbinary("RETR " + species_file, out_f.write)
                finally:
                    # Release the connection on success and on failure.
                    ftp.close()
            self.logger.info('Done.')

            # load saved file
            self.logger.info('Parsing "dataset_names.txt.gz"...')
            species_li = []
            for row in tab2list(outfile, (0, 4, 5), header=0):
                # Explicit conditional: the former `and/or` form also turned
                # a numeric 0 into None.
                taxid = int(row[1]) if is_int(row[1]) else None
                species_li.append([row[0], row[2], taxid])
            self.logger.info('Done.')
        finally:
            os.remove(outfile)

        # A successful result is diagnostic output, not an error.
        self.logger.debug('\n %s', pprint.pformat(species_li))
        return species_li
Пример #3
0
    def get_all_species(self):
        """Download and parse ``species.txt.gz`` for ``self.release``.

        The file is fetched from the FTP host named by the class attribute
        ``ENSEMBL_FTP_HOST`` into a temporary file, which is always removed
        before returning.

        :return: list of rows built from columns 1, 2 and 7 of the file
            (db_name, common_name, taxid). The last element of each row is
            an ``int`` when the column is integer-like, otherwise ``None``.
            Rows whose db_name starts with ``"mus_musculus_"`` are dropped.
        """
        import tempfile

        # mkstemp creates the file atomically; mktemp() only returned a name
        # and left a window in which another process could claim it.
        fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
        try:
            self.logger.info('Downloading "species.txt.gz"...')
            species_file = '/pub/release-%s/mysql/ensembl_production_%s/species.txt.gz' % (self.release, self.release)
            # The with-block closes the file even if the transfer fails.
            with os.fdopen(fd, 'wb') as out_f:
                ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
                try:
                    ftp.login()
                    ftp.retrbinary("RETR " + species_file, out_f.write)
                finally:
                    # Release the connection on success and on failure.
                    ftp.close()
            self.logger.info('Done.')

            # load saved file
            self.logger.info('Parsing "species.txt.gz"...')
            species_li = []
            for row in tab2list(outfile, (1, 2, 7), header=0):   # db_name,common_name,taxid
                # Explicit conditional: the former `and/or` form also turned
                # a numeric 0 into None.
                taxid = int(row[-1]) if is_int(row[-1]) else None
                # as of ensembl 87, there are also mouse strains. keep only the "original" one
                if row[0].startswith("mus_musculus_"):
                    continue
                species_li.append(row[:-1] + [taxid])
            self.logger.info('Done.')
        finally:
            os.remove(outfile)

        return species_li
Пример #4
0
    def _select_species(self):
        """
        Return the list of species to download data for.

        The species file located by ``self.get_species_file()`` is fetched
        from the FTP host named by the class attribute ``ENSEMBL_FTP_HOST``
        into a temporary file, which is always removed before returning.

        :return: list of three-element lists,
            ``[[species_name1, common_name1, taxid1], ...]``, taken from
            columns 0, 5 and 4 of the file. ``taxid`` is an ``int`` when the
            column is integer-like, otherwise ``None``.
        """
        import pprint
        import tempfile

        # mkstemp creates the file atomically; mktemp() only returned a name
        # and left a window in which another process could claim it.
        fd, outfile = tempfile.mkstemp(suffix='.txt.gz')
        try:
            self.logger.info('Downloading Species List...')
            # The with-block closes the file even if the transfer fails.
            with os.fdopen(fd, 'wb') as out_f:
                ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
                try:
                    ftp.login()
                    species_file = self.get_species_file()
                    ftp.retrbinary("RETR " + species_file, out_f.write)
                finally:
                    # Release the connection on success and on failure.
                    ftp.close()
            self.logger.info('Done.')

            # load saved file
            self.logger.info('Loading Species List...')
            species_li = []
            for row in tab2list(outfile, (0, 4, 5), header=0):
                # Explicit conditional: the former `and/or` form also turned
                # a numeric 0 into None.
                taxid = int(row[1]) if is_int(row[1]) else None
                species_li.append([row[0], row[2], taxid])
            self.logger.info('Done.')
        finally:
            os.remove(outfile)

        self.logger.debug('\n %s', pprint.pformat(species_li))
        return species_li
Пример #5
0
    def _dis_max_query(self, q):
        """Build the ``dis_max`` query dict for a single search term.

        Double quotes and backslashes are removed from *q* first. If the
        cleaned term is integer-like, the query consists of one ``term``
        clause on ``entrezgene`` (weight 8). Otherwise it combines weighted
        ``function_score`` clauses over symbol, name (phrase and all-words),
        unigene, refseq/accession, go, and a catch-all ``query_string``.

        The dict is built directly rather than by splicing *q* into a JSON
        template and re-parsing it: the round trip raised on control
        characters such as tabs or newlines in *q*, and was wasted work for
        integer terms.

        :param q: the search term (a string).
        :return: ``{"dis_max": {"tie_breaker": 0, "boost": 1,
            "queries": [...]}}``.
        """
        # Still stripped so that results match the previous implementation,
        # which had to remove these characters to keep its JSON valid.
        q = q.replace('"', '').replace('\\', '')

        def weighted(clause, weight):
            # Wrap a clause in the function_score envelope used throughout.
            return {"function_score": {"query": clause, "weight": weight}}

        if is_int(q):
            queries = [weighted({"term": {"entrezgene": int(q)}}, 8)]
        else:
            queries = [
                weighted({
                    "match": {
                        "symbol": {
                            "query": q,
                            "analyzer": "whitespace_lowercase"
                        }
                    }
                }, 5),
                # This makes phrase match of "cyclin-dependent kinase 2"
                # appear first.
                weighted({"match_phrase": {"name": q}}, 4),
                weighted({
                    "match": {
                        "name": {
                            "query": q,
                            "operator": "and",
                            "analyzer": "whitespace_lowercase"
                        }
                    }
                }, 3),
                weighted({
                    "match": {
                        "unigene": {
                            "query": q,
                            "analyzer": "string_lowercase"
                        }
                    }
                }, 1.1),
                weighted({
                    "multi_match": {
                        "query": q,
                        "fields": [
                            'refseq.rna', 'refseq.protein',
                            'accession.rna', 'accession.protein'
                        ],
                        "operator": "or"
                    }
                }, 1.1),
                weighted({
                    "match": {
                        "go": {
                            "query": q,
                            "analyzer": "string_lowercase"
                        }
                    }
                }, 1.1),
                weighted({
                    "query_string": {
                        "query": q,
                        "default_operator": "AND",
                        "auto_generate_phrase_queries": True
                    }
                }, 1),
            ]

        return {
            "dis_max": {
                "tie_breaker": 0,
                "boost": 1,
                "queries": queries
            }
        }
Пример #6
0
def dismax(q):
    """Return the ``dis_max`` search body for the term *q*.

    An integer-like term yields a single ``term`` clause on ``entrezgene``
    with weight 8. Any other term yields weighted ``function_score``
    clauses over symbol, name (phrase and all-words), unigene,
    refseq/accession, go, and a catch-all ``query_string``.

    The result has the shape
    ``{"query": {"dis_max": {"tie_breaker": 0, "boost": 1, "queries": [...]}}}``.
    """

    def scored(clause, weight):
        # Wrap a clause in the function_score envelope shared by all entries.
        return {"function_score": {"query": clause, "weight": weight}}

    def analyzed_match(field, analyzer):
        # Plain match on one field with an explicit analyzer.
        return {"match": {field: {"query": q, "analyzer": analyzer}}}

    if is_int(q):
        clauses = [scored({"term": {"entrezgene": int(q)}}, 8)]
    else:
        name_all_words = {
            "match": {
                "name": {
                    "query": q,
                    "operator": "and",
                    "analyzer": "whitespace_lowercase",
                }
            }
        }
        accession_fields = {
            "multi_match": {
                "query": q,
                "fields": [
                    'refseq.rna', 'refseq.protein',
                    'accession.rna', 'accession.protein',
                ],
                "operator": "or",
            }
        }
        free_text = {
            "query_string": {
                "query": q,
                "default_operator": "AND",
                "auto_generate_phrase_queries": True,
            }
        }
        clauses = [
            scored(analyzed_match("symbol", "whitespace_lowercase"), 5),
            # The phrase match ranks e.g. "cyclin-dependent kinase 2" first.
            scored({"match_phrase": {"name": q}}, 4),
            scored(name_all_words, 3),
            scored(analyzed_match("unigene", "string_lowercase"), 1.1),
            scored(accession_fields, 1.1),
            scored(analyzed_match("go", "string_lowercase"), 1.1),
            scored(free_text, 1),
        ]

    body = {"tie_breaker": 0, "boost": 1, "queries": clauses}
    return {"query": {"dis_max": body}}