Example #1
def schema_loci(schema_uri, headers_get):
    """ Retrieves the list of loci for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        loci : dict
            A dictionary with loci URIs as keys and
            loci names as values.
    """

    # get the list of loci
    loci_uri = aux.make_url(schema_uri, 'loci')
    loci_res = requests.get(loci_uri, headers=headers_get, verify=False)
    loci_res = loci_res.json()['Loci']

    # locus URI to locus name
    loci = {}
    for locus in loci_res:
        loci[str(locus['locus']['value'])] = locus['name']['value']

    return loci
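A minimal usage sketch for schema_loci. The schema URI and GET headers below are hypothetical placeholders; in the surrounding module they come from the local schema's .ns_config file and from a JSON GET headers constant.

# hypothetical values for illustration only
schema_uri = 'https://example-chewie-ns.org/NS/api/species/1/schemas/1'
headers_get = {'Authorization': '', 'accept': 'application/json'}

# map locus URIs to locus names and list them
loci = schema_loci(schema_uri, headers_get)
for locus_uri, locus_name in loci.items():
    print('{0}\t{1}'.format(locus_uri, locus_name))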
Example #2
def download_ptf(ptf_hash, download_folder, schema_id, species_id,
                 species_name, headers_get, base_url):
    """ Downloads the Prodigal training file for a schema.

        Parameters
        ----------
        ptf_hash : str
            Unique identifier of the Prodigal training file
            (BLAKE2 hash).
        download_folder : str
            Path to the directory to which the Prodigal
            training file should be saved.
        schema_id : str
            The identifier of the schema in the Chewie-NS.
        species_id : str
            The identifier of the schema's species in the
            Chewie-NS.
        species_name : str
            Scientific name of the schema species.
        headers_get : dict
            HTTP headers for GET requests.
        base_url : str
            Base URL of the Chewie Nomenclature server.

        Returns
        -------
        ptf_file : str
            Path to the Prodigal training file.
    """

    ptf_uri = aux.make_url(
        base_url, *['species', species_id, 'schemas', schema_id, 'ptf'])

    ptf_response = requests.get(ptf_uri, headers=headers_get, verify=False)

    ptf_file = os.path.join(download_folder,
                            '{0}.trn'.format(species_name.replace(' ', '_')))

    with open(ptf_file, 'wb') as ptf:
        ptf.write(ptf_response.content)

    return ptf_file
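A minimal usage sketch for download_ptf. All argument values are hypothetical placeholders; note that the training file URI is built only from the species and schema identifiers, so ptf_hash is simply passed along by the caller.

# hypothetical values for illustration only
base_url = 'https://example-chewie-ns.org/NS/api'
headers_get = {'Authorization': '', 'accept': 'application/json'}

ptf_path = download_ptf(ptf_hash='d41d8cd9',             # hypothetical hash
                        download_folder='/tmp/schema_dl',
                        schema_id='1',
                        species_id='1',
                        species_name='Yersinia pestis',
                        headers_get=headers_get,
                        base_url=base_url)
print('Prodigal training file saved to: {0}'.format(ptf_path))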
Example #3
def main(schema_dir, core_num, base_url, submit):

    # get ns configs
    local_date, schema_uri = aux.read_configs(schema_dir, '.ns_config')
    # get schema and species identifiers
    schema_id = schema_uri.split('/')[-1]
    species_id = schema_uri.split('/')[-3]
    if base_url is None:
        base_url = schema_uri.split('species/')[0]

    if submit is True and 'tutorial' not in base_url:
        print('\nOnly authorized registered users may submit new alleles.')
        token = aux.capture_login_credentials(base_url)
    else:
        token = ''

    # GET request headers
    headers_get = cnst.HEADERS_GET_JSON
    headers_get['Authorization'] = token

    # determine current user ID and Role
    if submit is True and 'tutorial' not in base_url:
        user_id, user_role, user_auth = aux.user_info(base_url, headers_get)
        # verify if user has authorization to submit
        url = aux.make_url(base_url, 'auth', 'check')
        response = requests.get(url, headers=headers_get)
        if response.status_code == 200:
            user_auth = True
        else:
            sys.exit(
                'Current user has no authorization to submit novel alleles.\n'
                'You can request authorization to submit novel alleles by sending '
                'an e-mail to: [email protected]')
        print('User id: {0}'.format(user_id))
        print('User role: {0}\n'.format(user_role))
    else:
        user_id = ''
        user_role = ''
        user_auth = 'tutorial' in base_url

    start_date = dt.datetime.now()
    start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S')
    print('Started at: {0}\n'.format(start_date_str))

    # POST requests headers
    headers_post = cnst.HEADERS_POST_JSON
    headers_post['Authorization'] = token
    headers_post['user_id'] = user_id
    # POST headers to send binary data
    headers_post_bytes = cnst.HEADERS_POST
    headers_post_bytes['Authorization'] = token
    headers_post_bytes['user_id'] = user_id

    schema_params = aux.read_configs(schema_dir, '.schema_config')

    # verify that local configs have a single value per parameter
    if not all(len(schema_params[k]) == 1 for k in schema_params
               if k != 'chewBBACA_version'):
        sys.exit('Cannot sync schema with multiple values per parameter.')

    # check if schema exists in the NS
    schema_name, ns_params = aux.get_species_schemas(schema_id, species_id,
                                                     base_url, headers_get)[2:]

    # verify that local configs match NS configs
    if not all(str(schema_params[k][0]) == ns_params[k]['value']
               for k in schema_params if k != 'chewBBACA_version'):
        sys.exit('Local configs do not match Chewie-NS configs.')

    # Get the name of the species from the provided id
    # or vice-versa
    species_id, species_name = aux.species_ids(species_id, base_url,
                                               headers_get)

    print('Schema id: {0}'.format(schema_id))
    print('Schema name: {0}'.format(schema_name))
    print("Schema's species: {0} (id={1})".format(species_name, species_id))
    print('Last synced: {0}'.format(local_date))

    # get last modification date
    # setting syncing date to last modification date will allow
    # all users to sync even when the schema is locked and being
    # updated by another user
    ns_date = ns_params['last_modified']['value']
    print('\nRemote schema was last modified on: {0}'.format(ns_date))

    # exit if remote schema has not been updated since last
    # sync date and current user does not wish to submit new alleles
    if local_date == ns_date and submit is False:
        sys.exit('\nRemote schema has not been updated since last sync '
                 'process. Local schema is up-to-date.')

    # Create a temporary dir for the new alleles
    temp_dir = os.path.join(os.path.dirname(schema_dir), 'temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # retrieve alleles added to schema after last sync date
    print('\nRetrieving alleles added to remote schema '
          'after {0}...'.format(local_date))
    loci_alleles, server_time, count = retrieve_latest(local_date, schema_uri,
                                                       headers_get, ns_date)

    print('Retrieved {0} alleles for {1} loci.'
          ''.format(count, len(loci_alleles)))

    # Get schema files from genes list file
    genes_list = os.path.join(schema_dir, '.genes_list')
    with open(genes_list, 'rb') as gl:
        genes = pickle.load(gl)

    # update loci structure
    not_in_ns, pickled_loci, \
        updated, not_update, \
        rearranged = update_loci_files(loci_alleles, genes,
                                       schema_dir, temp_dir)

    total_local = sum(len(v[0]) for v in not_in_ns.values())
    print('Local schema has {0} novel alleles for {1} '
          'loci.'.format(total_local, len(not_in_ns)))

    # check if there are any changes to make
    if len(pickled_loci) == 0:
        shutil.rmtree(temp_dir)
        sys.exit('Remote schema has not been altered and local schema '
                 'does not have novel alleles.')

    results = {}
    attributed = 0
    if submit is True and user_auth is True and len(not_in_ns) > 0:

        # attempt to lock schema
        lock_res = aux.simple_post_request(
            base_url, headers_post,
            ['species', species_id, 'schemas', schema_id, 'lock'],
            {'action': 'lock'})
        # if schema is already locked user cannot send alleles
        lock_status = lock_res.status_code
        if lock_status == 403:
            print('Schema is already locked. Another user might be updating '
                  'the schema. Please repeat the syncing process after a '
                  'while to add your new alleles to the Chewie-NS.\nThe '
                  'process will now update your local schema with the alleles '
                  'retrieved from the Chewie-NS.')
        else:

            # after locking, check if date matches ns_date
            date_res = aux.simple_get_request(
                base_url, headers_get,
                ['species', species_id, 'schemas', schema_id, 'modified'])

            date_value = (date_res.json()).split(' ')[-1]

            if date_value != ns_date:
                print('Data retrieved from the Chewie-NS has an older '
                      'timestamp than current schema timestamp. Schema '
                      'might have been updated before this syncing process. '
                      'Please repeat the syncing process in order to add '
                      'your new alleles to the schema. The process will now '
                      'update your local schema with the alleles retrieved '
                      'from the Chewie-NS.')

                # unlock schema
                lock_res = aux.simple_post_request(
                    base_url, headers_post,
                    ['species', species_id, 'schemas', schema_id, 'lock'],
                    {'action': 'unlock'})
            else:
                print(
                    'Collecting data and creating files to submit local alleles...'
                )
                # get list of loci for schema in the NS
                loci_res = aux.simple_get_request(
                    base_url, headers_get,
                    ['species', species_id, 'schemas', schema_id, 'loci'])
                # get loci files names from response
                for locus in loci_res.json()['Loci']:
                    locus_name = locus['name']['value'] + '.fasta'
                    locus_uri = locus['locus']['value']
                    if locus_name in not_in_ns:
                        not_in_ns[locus_name].append(locus_uri)

                # create files with length values to update
                length_files = create_lengths_files(not_in_ns, temp_dir)

                # create new alleles data
                alleles_files, \
                    loci_ids, \
                    loci_names = create_alleles_files(not_in_ns, base_url,
                                                      user_id, species_name,
                                                      species_id, schema_id,
                                                      temp_dir)

                # compress files with new alleles
                zipped_files = [
                    '{0}.zip'.format(file) for file in alleles_files
                ]
                list(map(aux.file_zipper, alleles_files, zipped_files))
                alleles_data = list(zip(zipped_files, loci_ids, loci_names))

                print('Sending and inserting new alleles...')
                failed, \
                    start_count = upload_alleles_data(alleles_data, length_files,
                                                      base_url, headers_post,
                                                      headers_post_bytes, species_id,
                                                      schema_id)

                # track progress through endpoint
                # set time limit for task completion (seconds)
                print()
                time_limit = 2100
                current_time = 0
                status = 'Updating'
                start_count = int(start_count.json()['nr_alleles'])
                while status != 'Complete' and (current_time < time_limit):
                    insertion_status = aux.simple_get_request(
                        base_url, headers_get, [
                            'species', species_id, 'schemas', schema_id,
                            'loci', 'locus', 'update'
                        ])
                    insertion_status = insertion_status.json()
                    if 'message' in insertion_status:
                        status = 'Complete'
                        results = insertion_status['identifiers']

                    current_count = int(insertion_status['nr_alleles'])

                    inserted = current_count - start_count
                    print('\r',
                          '    Inserted {0} alleles.'.format(inserted),
                          end='')
                    time.sleep(2)
                    current_time += 2

                if current_time != time_limit:
                    # determine alleles that were attributed an identifier
                    repeated = sum([len(r[0]) for l, r in results.items()])
                    attributed = sum([len(r[1]) for l, r in results.items()])

                    print(
                        '\nThe Chewie-NS inserted {0} new alleles and detected '
                        '{1} repeated alleles.'.format(attributed, repeated))
                else:
                    print(
                        '\nCould not retrieve allele identifiers assigned by '
                        'Chewie-NS. Will adapt schema with retrieved alleles. '
                        'Please repeat the syncing process in order to assign '
                        'the new identifiers for the submitted alleles.')

                # remove files in temp folder
                aux.remove_files(length_files)
                aux.remove_files(alleles_files)
                aux.remove_files(zipped_files)

    # change pickled files to FASTA files
    for locus, pick in pickled_loci.items():
        rearranged = pickle_to_fasta(locus, pick, temp_dir, results,
                                     rearranged)

    # change identifiers in SQLite DB
    if len(rearranged) > 0:
        print('\nUpdating local allele identifiers...')
        altered = sq.update_profiles(schema_dir, rearranged)
        if altered is not None:
            print('Updated {0} profiles.\n'.format(altered))
        else:
            print('Could not find local SQLite database to upload profiles.\n')

    # Re-determine the representative sequences
    if attributed > 0 or count > 0:
        PrepExternalSchema.main(temp_dir, schema_dir, core_num,
                                float(schema_params['bsr'][0]),
                                int(schema_params['minimum_locus_length'][0]),
                                11, '', None)

        # delete invalid alleles and genes files
        parent_dir = os.path.dirname(schema_dir)
        files = [
            os.path.join(parent_dir, file) for file in os.listdir(parent_dir)
            if 'invalid' in file
        ]

        for f in files:
            os.remove(f)

        # get last modification date
        last_modified = aux.simple_get_request(
            base_url, headers_get,
            ['species', species_id, 'schemas', schema_id, 'modified'])
        last_modified = (last_modified.json()).split(' ')[-1]
        server_time = last_modified

        # update NS config file with latest server time
        ns_configs = os.path.join(schema_dir, '.ns_config')
        with open(ns_configs, 'wb') as nc:
            pickle.dump([server_time, schema_uri], nc)

    print('Received {0} new alleles for {1} loci and sent '
          '{2} for {3} loci. '.format(count, len(pickled_loci), attributed,
                                      len(not_in_ns)))

    # delete temp directory
    shutil.rmtree(temp_dir)

    # delete pre-computed BSR values from 'short' directory
    # representatives might have changed and BSR values are outdated
    short_dir = os.path.join(schema_dir, 'short')
    bsr_files = [f for f in os.listdir(short_dir) if f.endswith('_bsr.txt')]
    for f in bsr_files:
        os.remove(os.path.join(short_dir, f))

    end_date = dt.datetime.now()
    end_date_str = dt.datetime.strftime(end_date, '%Y-%m-%dT%H:%M:%S')

    delta = end_date - start_date
    minutes, seconds = divmod(delta.total_seconds(), 60)

    print('\nFinished at: {0}'.format(end_date_str))
    print('Elapsed time: {0:.0f}m{1:.0f}s'.format(minutes, seconds))
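A minimal invocation sketch for main. The path and parameter values are hypothetical; the function expects a local schema directory previously downloaded from a Chewie-NS instance (it reads the .ns_config, .schema_config and .genes_list files in that directory).

main(schema_dir='/path/to/local/schema',  # hypothetical local schema directory
     core_num=4,                          # CPU cores for PrepExternalSchema
     base_url=None,                       # infer the server URL from .ns_config
     submit=False)                        # sync only; do not submit novel alleles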
Example #4
def retrieve_alleles(loci_new_alleles, server_time, schema_uri, count,
                     headers_get, ns_date):
    """ Retrieves alleles added to a schema in the Chewie-NS
        during a time interval, up to the maximum number of
        alleles that the server returns at a time (50000).

        Parameters
        ----------
        loci_new_alleles : dict
            A dictionary with loci identifiers as keys and
            dictionaries with alleles identifiers and DNA
            sequences as values.
        server_time : str
            The function will return alleles added to the
            schema after this date (format %Y-%m-%dT%H:%M:%S).
        schema_uri : str
            The URI of the schema in the Chewie-NS.
        count : int
            The cumulative number of sequences that has been
            returned.
        headers_get : dict
            HTTP headers for GET requests.
        ns_date : str
            The function will return alleles added to the
            schema up to this date (format %Y-%m-%dT%H:%M:%S).

        Returns
        -------
        A tuple with the following variables:

        loci_new_alleles : dict
            Input `loci_new_alleles` dictionary with alleles
            returned in the current and previous iterations.
        server_time : str
            The date of insertion of the last allele returned by
            the Chewie-NS.
        count : int
            The cumulative number of sequences that has been
            returned.
    """

    # request the new alleles starting on the date given
    url = aux.make_url(schema_uri, 'loci')
    payload = {'local_date': server_time, 'ns_date': ns_date}
    # get the new alleles
    response = requests.get(url,
                            headers=headers_get,
                            timeout=30,
                            params=payload)

    response_content = response.json()
    # get info about sequences that were added since last date
    new_alleles = response_content['newAlleles']

    # get headers info
    response_headers = response.headers
    if len(new_alleles) > 0:
        # get date of last added allele
        server_time = response_headers['Last-Allele']
        # group retrieved alleles by locus
        for allele in new_alleles:
            locus = '{0}{1}'.format(allele['locus_name']['value'], '.fasta')
            allele_id = allele['allele_id']['value']
            sequence = allele['nucSeq']['value']

            loci_new_alleles[locus][allele_id] = sequence

        # keep count of the number of retrieved alleles
        count += len(new_alleles)
    else:
        # get current server date if no alleles were added
        # since last server time
        server_time = response_headers['Server-Date']

    return (loci_new_alleles, server_time, count)
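A sketch of how a caller might page through new alleles with retrieve_alleles until the local timestamp catches up with the remote modification date. This mirrors what the retrieve_latest call in Example #3 is expected to do, but the loop and placeholder values below are illustrative assumptions; a defaultdict is used because the function indexes loci_new_alleles[locus] directly.

from collections import defaultdict

# hypothetical values for illustration only
schema_uri = 'https://example-chewie-ns.org/NS/api/species/1/schemas/1'
headers_get = {'Authorization': '', 'accept': 'application/json'}
local_date = '2020-01-01T00:00:00'   # last sync date
ns_date = '2020-06-01T00:00:00'      # remote last_modified date

loci_new_alleles = defaultdict(dict)
server_time = local_date
count = 0
# each request returns at most 50000 alleles, so keep requesting until
# the returned server time reaches the remote modification date
while server_time != ns_date:
    loci_new_alleles, server_time, count = retrieve_alleles(
        loci_new_alleles, server_time, schema_uri, count, headers_get, ns_date)

print('Retrieved {0} alleles for {1} loci.'.format(count, len(loci_new_alleles)))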
Example #5
def upload_alleles_data(alleles_data, length_files, base_url, headers_post,
                        headers_post_bytes, species_id, schema_id):
    """ Uploads files with the data to insert alleles and the
        length values for the sequences of each locus.

        Parameters
        ----------
        alleles_data : list
            List with tuples, one per locus, that contain the path
            to the ZIP archive with the data to insert alleles,
            the identifier of the locus, the locus file hash and
            the basename of the locus file.
        length_files : list
            List with paths to the pickled files that contain a
            dictionary with sequence hashes as keys and sequence
            lengths as values.
        base_url : str
            Base URL of the Nomenclature server.
        headers_post : dict
            HTTP headers for POST requests that accept JSON
            formatted data.
        headers_post_bytes : dict
            HTTP headers for POST requests that support file
            upload.
        species_id : int
            The identifier of the schema's species in the NS.
        schema_id : int
            The identifier of the schema in the NS.

        Returns
        -------
        failed : list of str
            List with the identifiers of the loci whose alleles
            data could not be fully uploaded.
        zip_res : dict
            A dictionary with the response returned by the last
            POST request. It has loci identifiers as keys and
            lists with two dictionaries as values. Both dictionaries
            map sequence hashes to sequence identifiers in the
            Chewie-NS: the first contains the hashes of sequences
            that were sent but were already present in the locus
            (repeated alleles), and the second contains the hashes
            of the sequences that were accepted and inserted into
            each locus.
    """

    uploaded = 0
    failed = []
    for i, a in enumerate(alleles_data):

        locus_id = a[1]

        # get length of alleles from current locus
        current_len = length_files[i]
        data = aux.pickle_loader(current_len)
        data = {locus_id: data[next(iter(data))]}
        data = json.dumps({'content': data})

        # send data to the NS
        send_url = aux.make_url(base_url, 'species', species_id, 'schemas',
                                schema_id, 'loci', locus_id, 'lengths')

        lengths_res = aux.upload_data(data, send_url, headers_post, False)
        length_status = lengths_res.status_code

        # get path to ZIP archive with data to insert alleles
        current_zip = a[0]

        # send data to insert alleles in the NS
        zip_url = aux.make_url(base_url, 'species', species_id, 'schemas',
                               schema_id, 'loci', locus_id, 'update')

        if alleles_data[i] == alleles_data[-1]:
            headers_post_bytes['complete'] = 'True'

        zip_res = aux.upload_file(current_zip, os.path.basename(current_zip),
                                  zip_url, headers_post_bytes, False)

        zip_status = zip_res.status_code

        # determine if both uploads were successful
        if length_status not in [200, 201] or zip_status not in [200, 201]:
            failed.append(locus_id)
        else:
            uploaded += 1
            print('\r',
                  '    Sent data for alleles of '
                  '{0}/{1} loci.'.format(uploaded, len(alleles_data)),
                  end='')

    return [failed, zip_res]
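A minimal usage sketch for upload_alleles_data. The inputs below are hypothetical; in Example #3 alleles_data comes from zipping the files produced by create_alleles_files and length_files comes from create_lengths_files. Only the first two elements of each tuple (ZIP path and locus identifier) are used by the function.

# hypothetical inputs for illustration only
alleles_data = [('/tmp/temp/locus1_alleles.zip', '1', 'locus1')]  # (zip path, locus id, locus name)
length_files = ['/tmp/temp/locus1_lengths']                       # pickled {hash: length} dicts
base_url = 'https://example-chewie-ns.org/NS/api'
headers_post = {'Authorization': '', 'Content-type': 'application/json'}
headers_post_bytes = {'Authorization': ''}

failed, last_response = upload_alleles_data(alleles_data, length_files,
                                            base_url, headers_post,
                                            headers_post_bytes,
                                            species_id=1, schema_id=1)
if len(failed) > 0:
    print('Could not fully upload data for loci: {0}'.format(','.join(failed)))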
Example #6
def download_fastas(loci, download_folder, headers_get, schema_date):
    """ Downloads and writes FASTA files for the loci of a
        schema in the Chewie-NS.

        Parameters
        ----------
        loci : dict
            A dictionary with loci URIs as keys and
            loci names as values.
        download_folder : str
            Path to the directory where the FASTA files
            will be created.
        headers_get : dict
            HTTP headers for GET requests.
        schema_date : str
            The function will only retrieve alleles
            that were inserted up to this date.

        Returns
        -------
        ns_files : list
            List with the paths to the schema's FASTA
            files that were created.
    """

    # Total number of loci
    total_loci = len(loci)
    print('Number of loci to download: {0}'.format(total_loci))

    # build the list of urls to get
    fasta_urls = [aux.make_url(locus, 'fasta') for locus in loci]

    # multithread the requests
    print('Downloading schema files...')
    total = 0
    failed = []
    downloaded = 0
    ns_files = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        for result in executor.map(get_fasta_seqs, fasta_urls,
                                   repeat(headers_get), repeat(schema_date)):
            locus_id = loci[result[0]]
            locus_info = result[1]
            if locus_info.status_code in [200, 201]:
                locus_file = build_fasta(locus_id, locus_info, download_folder)
                ns_files.append(locus_file)
                downloaded += 1
            else:
                failed.append(locus_id)
            total += 1
            print('\r',
                  'Downloaded: '
                  '{0}/{1}'.format(downloaded, total_loci),
                  end='')

    print('\nDownloaded and wrote FASTA files for '
          '{0}/{1} loci'.format(downloaded, total))
    print('Failed download for {0} loci.\n'.format(len(failed)))
    if len(failed) > 0:
        sys.exit('Failed download for the following loci: {0}\n'
                 'Please download files for the failed loci '
                 'through the API or retry the full schema '
                 'download.'.format(','.join(failed)))

    return ns_files
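A minimal usage sketch for download_fastas. The loci mapping would normally come from schema_loci (Example #1); the values below are hypothetical placeholders.

# hypothetical values for illustration only
loci = {'https://example-chewie-ns.org/NS/api/loci/1': 'locus1',
        'https://example-chewie-ns.org/NS/api/loci/2': 'locus2'}
headers_get = {'Authorization': '', 'accept': 'application/json'}

fasta_paths = download_fastas(loci, '/tmp/schema_download', headers_get,
                              '2020-06-01T00:00:00')
print('Created {0} FASTA files.'.format(len(fasta_paths)))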