Пример #1
0
    def setUp(self):
        """Create a scratch directory and download the files kept in it.

        Sets ``self.pyserini_root``, ``self.tmp`` and ``self.round3_runs``.
        One run file is downloaded into ``<tmp>/runs``; every URL in
        ``self.round3_runs`` is downloaded into ``<tmp>`` with its paired
        value passed to ``download_url`` as ``md5``, and the resulting file
        is asserted to exist.
        """
        # Repository root relative to the current working directory.
        self.pyserini_root = '../..' if os.getcwd().endswith('clprf') else '.'

        scratch = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
        self.tmp = scratch

        # The random suffix can repeat; clear out any directory already there.
        if os.path.exists(scratch):
            shutil.rmtree(scratch)

        runs_dir = f'{scratch}/runs'
        os.mkdir(scratch)
        os.mkdir(runs_dir)

        self.round3_runs = {
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
            'dfccc32efd58a8284ae411e5c6b27ce9',
        }

        download_url(
            'https://storage.googleapis.com/neuralresearcher_data/trec_covid/data/53/covidex.t5.final.txt',
            runs_dir)

        for url, checksum in self.round3_runs.items():
            print(f'Verifying stored run at {url}...')
            # Last path component, minus the Dropbox 'force download' parameter
            filename = re.sub('\\?dl=1$', '', url.split('/')[-1])

            download_url(url, scratch, md5=checksum, force=True)
            self.assertTrue(os.path.exists(os.path.join(scratch, filename)))
Пример #2
0
    def setUp(self):
        """Fetch the three CORD-19 indexes and the round 2 fusion run.

        Each index URL is handed to ``download_and_unpack_index`` in turn.
        The gzipped run is downloaded into ``runs`` and then decompressed
        next to the archive.
        """
        index_urls = (
            'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz',
            'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz',
            'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz',
        )
        for index_url in index_urls:
            download_and_unpack_index(index_url)

        download_url('https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round2/anserini.covid-r2.fusion1.txt.gz', 'runs')

        # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
        with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as packed, \
                open('runs/anserini.covid-r2.fusion1.txt', 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)
Пример #3
0
    def setUp(self):
        """Fetch the three CORD-19 indexes and the round 2 fusion run.

        Each index URL is handed to ``download_and_unpack_index`` in turn.
        The gzipped run is downloaded into ``runs`` and then decompressed
        next to the archive.
        """
        index_urls = (
            'https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/ouvp7zyqsp9y9gh/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/e1118vjuf58ojt4/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1',
        )
        for index_url in index_urls:
            download_and_unpack_index(index_url)

        download_url('https://www.dropbox.com/s/wqb0vhxp98g7dxh/anserini.covid-r2.fusion1.txt.gz?dl=1', 'runs')

        # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
        with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as packed, \
                open('runs/anserini.covid-r2.fusion1.txt', 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)
Пример #4
0
def main(args):
    """Download and checksum-check every link of one hosting type found at ``args.url``.

    The page at ``args.url`` is read as UTF-8 text and scanned line by line.
    A line is processed only when it contains both a link matching the
    selected host pattern and a backtick-quoted 32-character checksum. Each
    such link is passed to ``download_url`` with that checksum as ``md5``,
    and the downloaded file is removed afterwards.

    Args:
        args: Object with a ``url`` attribute and the boolean-like flags
            ``dropbox``, ``gitlab`` and ``vault``; the first truthy flag,
            in that order, selects the link pattern.

    Raises:
        SystemExit: With status 1 when none of the flags is set.
    """
    print(args.url)

    # Validate the flags before touching the network so a usage error fails fast.
    if args.dropbox:
        pattern = re.compile(r'https://www\.dropbox\.com/[^)]+')
    elif args.gitlab:
        # See https://git.uwaterloo.ca/jimmylin/anserini-indexes/-/raw/master/README.md
        # Tricky pattern to write because some lines might have two GitLab URLs
        pattern = re.compile(r'https://git\.uwaterloo\.ca/([^)]+)\.tar\.gz')
    elif args.vault:
        pattern = re.compile(r'https://vault\.cs\.uwaterloo\.ca/[^)]+')
    else:
        print(
            'Must specify one of --dropbox, --gitlab, --vault: type of link to check.'
        )
        # A usage error must not report success to the calling shell.
        raise SystemExit(1)

    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(args.url) as response:
        contents = response.read().decode('utf-8')

    md5sum_pattern = re.compile('`([a-z0-9]{32})`')
    for line in contents.splitlines():
        match = pattern.search(line)
        if not match:
            continue
        md5sum_match = md5sum_pattern.search(line)
        if not md5sum_match:
            continue

        url = match.group()
        # Vault links get a '/download' suffix unless they already end with one.
        if args.vault and not url.endswith('/download'):
            url = url + '/download'
        md5sum = md5sum_match.group(1)

        print(f'Downloading and verifying {url}')
        destination = download_url(url, '.', md5=md5sum)
        print(f'Finished downloading to {destination}, removing...')
        os.remove(destination)
Пример #5
0
    def setUp(self):
        """Download every stored run into a fresh scratch directory.

        Sets ``self.runs`` (URL mapped to the value passed to
        ``download_url`` as ``md5``) and ``self.tmp`` (the scratch directory
        name). Each URL is downloaded into the scratch directory and the
        resulting file is asserted to exist.
        """
        self.runs = {
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.abstract.qq.bm25.txt':
            'd08d85c87e30d6c4abf54799806d282f',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.abstract.qdel.bm25.txt':
            'd552dff90995cd860a5727637f0be4d1',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.full-text.qq.bm25.txt':
            '6c9f4c09d842b887262ca84d61c61a1f',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.full-text.qdel.bm25.txt':
            'c5f9db7733c72eea78ece2ade44d3d35',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.paragraph.qq.bm25.txt':
            '872673b3e12c661748d8899f24d3ba48',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.paragraph.qdel.bm25.txt':
            'c1b966e4c3f387b6810211f339b35852',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.abstract.qq.bm25.txt':
            '56ac5a0410e235243ca6e9f0f00eefa1',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.abstract.qdel.bm25.txt':
            '115d6d2e308b47ffacbc642175095c74',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.full-text.qq.bm25.txt':
            'af0d10a5344f4007e6781e8d2959eb54',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.full-text.qdel.bm25.txt':
            '594d469b8f45cf808092a3d8e870eaf5',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.paragraph.qq.bm25.txt':
            '6f468b7b60aaa05fc215d237b5475aec',
            'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.paragraph.qdel.bm25.txt':
            'b7b39629c12573ee0bfed8687dacc743',
        }

        scratch = f'tmp{randint(0, 10000)}'
        self.tmp = scratch

        # The random suffix can repeat; clear out any directory already there.
        if os.path.exists(scratch):
            shutil.rmtree(scratch)
        os.mkdir(scratch)

        for url, checksum in self.runs.items():
            print(f'Verifying stored run at {url}...')
            # Last path component, minus the Dropbox 'force download' parameter
            filename = re.sub('\\?dl=1$', '', url.split('/')[-1])

            download_url(url, scratch, md5=checksum, force=True)
            self.assertTrue(os.path.exists(os.path.join(scratch, filename)))
            print('')
Пример #6
0
    def setUp(self):
        """Create a scratch directory, then download and gunzip the round 5 runs.

        Sets ``self.pyserini_root``, ``self.tmp`` and ``self.round5_runs``.
        Every URL in ``self.round5_runs`` is downloaded into ``<tmp>/runs/``
        with its paired value passed to ``download_url`` as ``md5``; the
        archive is asserted to exist and is decompressed to a sibling file
        named after the archive without its last extension.
        """
        # Repository root relative to the current working directory.
        self.pyserini_root = '../..' if os.getcwd().endswith('clprf') else '.'

        scratch = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
        self.tmp = scratch

        # The random suffix can repeat; clear out any directory already there.
        if os.path.exists(scratch):
            shutil.rmtree(scratch)

        os.mkdir(scratch)
        os.mkdir(f'{scratch}/runs')

        self.round5_runs = {
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.1s.gz':
            '2181ae5b7fe8bafbd3b41700f3ccde02',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.2s.gz':
            'e61f9b6de5ffbe1b5b82d35216968154',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.2s.gz':
            '6e517a5e044d8b7ce983f7e165cf4aeb',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.1s.gz':
            'dc9b4b45494294a8448cf0693f07f7fd'
        }

        for url, checksum in self.round5_runs.items():
            print(f'Verifying stored run at {url}...')
            # Last path component, minus the Dropbox 'force download' parameter
            filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
            # Archive name with its final dot-separated component dropped
            unpacked_name = '.'.join(filename.split('.')[:-1])

            download_url(url, f'{scratch}/runs/', md5=checksum, force=True)
            archive_path = os.path.join(f'{scratch}/runs/', filename)
            self.assertTrue(os.path.exists(archive_path))

            with gzip.open(f'{scratch}/runs/{filename}', 'rb') as packed, \
                    open(f'{scratch}/runs/{unpacked_name}', 'wb') as unpacked:
                shutil.copyfileobj(packed, unpacked)
    def check_runs(self, runs):
        """Download each run into a throwaway directory and assert that it arrived.

        Args:
            runs: Mapping from download URL to the value passed to
                ``download_url`` as ``md5``.

        The scratch directory is removed when the method finishes, including
        when a download or an assertion raises, so a failing check does not
        leave ``tmpNNNN`` directories behind in the working directory.
        """
        tmp = f'tmp{randint(0, 10000)}'

        # In the rare event there's a collision
        if os.path.exists(tmp):
            shutil.rmtree(tmp)

        os.mkdir(tmp)
        try:
            for url, md5 in runs.items():
                print(f'Verifying stored run at {url}...')
                filename = url.split('/')[-1]
                filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter

                download_url(url, tmp, md5=md5, force=True)
                self.assertTrue(os.path.exists(os.path.join(tmp, filename)))
                print('')
        finally:
            # Clean up on failure as well as on success.
            shutil.rmtree(tmp)
Пример #8
0
def check(index):
    """Download every URL of every entry in ``index``, then delete the file.

    Args:
        index: Mapping from entry name to a mapping with an ``'md5'`` value
            and a ``'urls'`` iterable. Each URL is passed to
            ``download_url`` with the entry's ``'md5'`` value, and the
            returned path is removed afterwards.
    """
    for name, info in index.items():
        print(f'# Checking "{name}"...')
        expected_md5 = info['md5']
        for link in info['urls']:
            saved_path = download_url(link, '.', md5=expected_md5)
            print(f'Finished downloading to {saved_path}, cleaning up.')
            os.remove(saved_path)
        print('\n')
Пример #9
0
    def setUp(self):
        """Download every stored run into a fresh scratch directory.

        Sets ``self.runs`` (URL mapped to the value passed to
        ``download_url`` as ``md5``) and ``self.tmp`` (the scratch directory
        name). Each URL is downloaded into the scratch directory and the
        resulting file is asserted to exist.
        """
        self.runs = {
            'https://www.dropbox.com/s/g80cqdxud1l06wq/anserini.covid-r3.abstract.qq.bm25.txt?dl=1':
                'd08d85c87e30d6c4abf54799806d282f',
            'https://www.dropbox.com/s/sjcnxq7h0a3j3xz/anserini.covid-r3.abstract.qdel.bm25.txt?dl=1':
                'd552dff90995cd860a5727637f0be4d1',
            'https://www.dropbox.com/s/4bjx35sgosu0jz0/anserini.covid-r3.full-text.qq.bm25.txt?dl=1':
                '6c9f4c09d842b887262ca84d61c61a1f',
            'https://www.dropbox.com/s/mjt7y1ywae784d0/anserini.covid-r3.full-text.qdel.bm25.txt?dl=1':
                'c5f9db7733c72eea78ece2ade44d3d35',
            'https://www.dropbox.com/s/qwn7jd8vg2chjik/anserini.covid-r3.paragraph.qq.bm25.txt?dl=1':
                '872673b3e12c661748d8899f24d3ba48',
            'https://www.dropbox.com/s/2928i60fj2i09bt/anserini.covid-r3.paragraph.qdel.bm25.txt?dl=1':
                'c1b966e4c3f387b6810211f339b35852',
            'https://www.dropbox.com/s/mf79huhxfy96g6i/anserini.covid-r4.abstract.qq.bm25.txt?dl=1':
                '56ac5a0410e235243ca6e9f0f00eefa1',
            'https://www.dropbox.com/s/4zau6ejrkvgn9m7/anserini.covid-r4.abstract.qdel.bm25.txt?dl=1':
                '115d6d2e308b47ffacbc642175095c74',
            'https://www.dropbox.com/s/bpdopie6gqffv0w/anserini.covid-r4.full-text.qq.bm25.txt?dl=1':
                'af0d10a5344f4007e6781e8d2959eb54',
            'https://www.dropbox.com/s/rh0uy71ogbpas0v/anserini.covid-r4.full-text.qdel.bm25.txt?dl=1':
                '594d469b8f45cf808092a3d8e870eaf5',
            'https://www.dropbox.com/s/ifkjm8ff8g2aoh1/anserini.covid-r4.paragraph.qq.bm25.txt?dl=1':
                '6f468b7b60aaa05fc215d237b5475aec',
            'https://www.dropbox.com/s/keuogpx1dzinsgy/anserini.covid-r4.paragraph.qdel.bm25.txt?dl=1':
                'b7b39629c12573ee0bfed8687dacc743',
        }

        scratch = f'tmp{randint(0, 10000)}'
        self.tmp = scratch

        # The random suffix can repeat; clear out any directory already there.
        if os.path.exists(scratch):
            shutil.rmtree(scratch)
        os.mkdir(scratch)

        for url, checksum in self.runs.items():
            print(f'Verifying stored run at {url}...')
            # Last path component, minus the Dropbox 'force download' parameter
            filename = re.sub('\\?dl=1$', '', url.split('/')[-1])

            download_url(url, scratch, md5=checksum, force=True)
            self.assertTrue(os.path.exists(os.path.join(scratch, filename)))
            print('')
Пример #10
0
    def setUp(self):
        """Create a scratch directory and download the files kept in it.

        Sets ``self.pyserini_root``, ``self.tmp`` and ``self.round4_runs``.
        One gzipped run is downloaded into ``<tmp>/runs`` and decompressed
        there; every URL in ``self.round4_runs`` is downloaded into
        ``<tmp>`` with its paired value passed to ``download_url`` as
        ``md5``, and the resulting file is asserted to exist.
        """
        # Repository root relative to the current working directory.
        self.pyserini_root = '..' if os.getcwd().endswith('integrations') else '.'

        scratch = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
        self.tmp = scratch

        # The random suffix can repeat; clear out any directory already there.
        if os.path.exists(scratch):
            shutil.rmtree(scratch)

        runs_dir = f'{scratch}/runs'
        os.mkdir(scratch)
        os.mkdir(runs_dir)

        self.round4_runs = {
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
            'dfccc32efd58a8284ae411e5c6b27ce9',
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt':
            '7a5c27e8e052c49ff72d557051825973',
        }

        download_url(
            'https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz',
            runs_dir)

        # Decompress the downloaded run next to its archive.
        with gzip.open(f'{scratch}/runs/covidex.r4.d2q.duot5.gz', 'rb') as packed, \
                open(f'{scratch}/runs/covidex.r4.d2q.duot5', 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)

        for url, checksum in self.round4_runs.items():
            print(f'Verifying stored run at {url}...')
            # Last path component, minus the Dropbox 'force download' parameter
            filename = re.sub('\\?dl=1$', '', url.split('/')[-1])

            download_url(url, scratch, md5=checksum, force=True)
            self.assertTrue(os.path.exists(os.path.join(scratch, filename)))
            print('')
Пример #11
0
    def setUp(self):
        """Fetch the three CORD-19 indexes and the round 2 fusion run.

        Each index URL is handed to ``download_and_unpack_index`` in turn.
        The gzipped run is downloaded into ``runs`` and then decompressed
        next to the archive.
        """
        index_urls = (
            'https://www.dropbox.com/s/wxjoe4g71zt5za2/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/di27r5o2g5kat5k/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/6ib71scm925mclk/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1',
        )
        for index_url in index_urls:
            download_and_unpack_index(index_url)

        download_url(
            'https://www.dropbox.com/s/wqb0vhxp98g7dxh/anserini.covid-r2.fusion1.txt.gz?dl=1',
            'runs')

        # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
        with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as packed, \
                open('runs/anserini.covid-r2.fusion1.txt', 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)
Пример #12
0
def main(args):
    """Download and checksum-check every Dropbox link found at ``args.url``.

    The page at ``args.url`` is read as UTF-8 text and scanned line by line.
    A line is processed only when it contains both a Dropbox link and a
    backtick-quoted 32-character checksum. Each such link is passed to
    ``download_url`` with that checksum as ``md5``, and the downloaded file
    is removed afterwards.

    Args:
        args: Object with a ``url`` attribute naming the page to scan.
    """
    print(args.url)

    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(args.url) as response:
        contents = response.read().decode('utf-8')

    # Dots are escaped so that only the literal host name matches.
    dropbox_pattern = re.compile(r'https://www\.dropbox\.com/[^)]+')
    md5sum_pattern = re.compile('`([a-z0-9]{32})`')

    for line in contents.splitlines():
        match = dropbox_pattern.search(line)
        if not match:
            continue
        md5sum_match = md5sum_pattern.search(line)
        if not md5sum_match:
            continue

        url = match.group()
        md5sum = md5sum_match.group(1)
        print(f'Downloading and verifying {url}')
        destination = download_url(url, '.', md5=md5sum)
        print(f'Finished downloading to {destination}, removing...')
        os.remove(destination)
Пример #13
0
 def download_kilt_topics(cls, task: str, force=False):
     """Download the file registered for ``task``, trying each known URL in order.

     Args:
         task: Key into ``KILT_QUERY_INFO``.
         force: Forwarded to ``download_url`` as ``force``.

     Returns:
         Whatever ``download_url`` returns for the first URL that does not
         raise ``HTTPError`` or ``URLError``.

     Raises:
         ValueError: If ``task`` is not a key of ``KILT_QUERY_INFO``, or if
             every URL failed with ``HTTPError``/``URLError``.
     """
     if task not in KILT_QUERY_INFO:
         raise ValueError(f'Unrecognized query name {task}')
     # Keep the ``task`` string intact; use a separate name for its record.
     info = KILT_QUERY_INFO[task]
     md5 = info['md5']
     save_dir = os.path.join(get_cache_home(), 'queries')
     # exist_ok avoids the check-then-create race between concurrent callers.
     os.makedirs(save_dir, exist_ok=True)
     for url in info['urls']:
         try:
             return download_url(url, save_dir, force=force, md5=md5)
         except (HTTPError, URLError):
             print(
                 f'Unable to download encoded query at {url}, trying next URL...'
             )
     raise ValueError('Unable to download encoded query at any known URLs.')
Пример #14
0
def main():
    """Run the round 3 pipeline: verify, run, fuse, evaluate, then score the final runs.

    Steps, in order: check that the first three entries of ``indexes`` are
    directories; concatenate the round 1 and round 2 qrels into a combined
    file; call the verification, run, fusion, submission-preparation and
    evaluation helpers; download the three post-processed runs into
    ``runs``; evaluate the final runs against the round 3 qrels.

    Raises:
        SystemExit: With status 1 when any of the three required index
            directories is missing. Nothing else is executed in that case.
    """
    required_indexes = (indexes[0], indexes[1], indexes[2])
    if not all(os.path.isdir(path) for path in required_indexes):
        print('Required indexes do not exist. Please download first.')
        # Stop here: the steps below cannot succeed without the indexes.
        raise SystemExit(1)

    os.system(
        'cat src/main/resources/topics-and-qrels/qrels.covid-round1.txt ' +
        'src/main/resources/topics-and-qrels/qrels.covid-round2.txt ' +
        '> src/main/resources/topics-and-qrels/qrels.covid-round12.txt')

    round3_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3.txt'
    round2_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round12.txt'
    round3_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt'

    verify_stored_runs(stored_runs)
    perform_runs()
    perform_fusion()
    prepare_final_submissions(round2_cumulative_qrels)
    evaluate_runs(round2_cumulative_qrels, cumulative_runs)
    evaluate_runs(round3_cumulative_qrels, cumulative_runs)

    # Download the NIST post-processed runs.
    print('')
    post_processed_urls = (
        'https://www.dropbox.com/s/ilqgky1tti0zvez/anserini.final-r3.fusion1.post-processed.txt?dl=1',
        'https://www.dropbox.com/s/ue3z6xxxca9krkb/anserini.final-r3.fusion2.post-processed.txt?dl=1',
        'https://www.dropbox.com/s/95vk831wp1ldnpm/anserini.final-r3.rf.post-processed.txt?dl=1',
    )
    for url in post_processed_urls:
        download_url(url, 'runs', force=True)

    evaluate_runs(round3_qrels, final_runs)