Example no. 1
def species_schemas(species_id, base_url, headers_get):
    """ Retrieves the species and all the schemas for that
        species.

        Parameters
        ----------
        species_id : str
            The integer identifier of the species in
            the Chewie-NS.
        base_url : str
            Base URL of the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        res : list of dict
            The first dictionary contains the species
            URI and name and the following dictionaries
            contain the URI and name for all schemas
            associated with the species.
    """

    endpoint_list = ['species', species_id]
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    res = res.json()

    return res
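
A minimal usage sketch for this helper. The base URL and headers below are illustrative assumptions (the project presumably defines its own constants), and '8' is an arbitrary species identifier:

# hypothetical values for illustration only
base_url = 'https://chewbbaca.online/NS/api'  # assumed public instance
headers_get = {'accept': 'application/json'}
res = species_schemas('8', base_url, headers_get)
species_info = res[0]    # species URI and name
schemas_info = res[1:]   # URI and name of each schema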
Example no. 2
def schema_stats(species_id, base_url, headers_get):
    """ Retrieves schema properties, number of loci and
        number of alleles for all schemas of a species in
        the Chewie-NS.

        Parameters
        ----------
        species_id : str
            The integer identifier of the species in
            the Chewie-NS.
        base_url : str
            Base URL of the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        res : list of dict or None
            List with one dict per schema, or NoneType
            if it was not possible to retrieve information.
    """

    endpoint_list = ['stats', 'species', species_id, 'totals']
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    status_code = res.status_code
    if status_code not in [200, 201]:
        res = None
    else:
        res = res.json()['message']

    return res
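
Since this helper returns None for any response outside 200/201, callers should test for it before use (a sketch reusing the assumed base_url and headers_get from above):

stats = schema_stats('8', base_url, headers_get)
if stats is None:
    print('Could not retrieve schema stats.')
else:
    # one dict per schema with properties and totals
    print('Retrieved stats for {0} schemas.'.format(len(stats)))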
Example no. 3
def schema_loci(schema_uri, headers_get):
    """ Retrieves the list of loci for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        loci : dict
            A dictionary with loci URIs as keys and
            loci names as values.
    """

    # get the list of loci
    loci_url, loci_res = cr.simple_get_request(schema_uri, headers_get,
                                               ['loci'])
    loci_res = loci_res.json()['Loci']

    # locus URI to locus name
    loci = {}
    for locus in loci_res:
        loci[str(locus['locus']['value'])] = locus['name']['value']

    return loci
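
A short sketch of how the mapping might be consumed; schema_uri is assumed to be a full schema URI such as the ones returned by species_schemas above:

loci = schema_loci(schema_uri, headers_get)
for locus_uri, locus_name in loci.items():
    print('{0}\t{1}'.format(locus_name, locus_uri))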
Example no. 4
def check_compressed(schema_uri, headers_get):
    """ Determines if there is a compressed version of
        a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        list
            A list with the following elements:

            - The URI for the compressed version of the schema (str).
            - The timestamp of the compressed version (str or None).
              Indicates the last modification date of the schema at
              the time of compression; None if no compressed version
              exists.
    """

    zip_uri, zip_response = cr.simple_get_request(
        schema_uri, headers_get, ['zip'], parameters={'request_type': 'check'})
    zip_info = zip_response.json()
    if 'zip' in zip_info:
        zip_file = zip_info['zip'][0]
        zip_date = zip_file.split('_')[-1].split('.zip')[0]
    else:
        zip_date = None

    return [zip_uri, zip_date]
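
A hedged sketch of how the result is typically checked (the None case signals that no compressed version exists):

zip_uri, zip_date = check_compressed(schema_uri, headers_get)
if zip_date is None:
    print('No compressed version of the schema is available.')
else:
    print('Compressed version dated {0}.'.format(zip_date))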
Example no. 5
def get_fasta_seqs(url, headers_get, schema_date):
    """ Retrieves the DNA sequences of a locus in the
        Chewie-NS.

        Parameters
        ----------
        url : str
            Endpoint URL to make the request.
        headers_get : dict
            HTTP headers for GET requests.
        schema_date : str
            The function will only retrieve alleles
            that were inserted up to this date.

        Returns
        -------
        tuple
            Tuple with the following elements:
            - URI of the locus.
            - Response object with the DNA sequences
              that were downloaded.
    """

    payload = {'date': schema_date}
    tries = 0
    max_tries = 3
    downloaded = False
    while downloaded is False:
        res = cr.simple_get_request(url, headers_get, [], payload, False,
                                    180)[1]
        tries += 1
        if res.status_code in [200, 201] or tries == max_tries:
            downloaded = True

    # str.rstrip removes a set of characters, not a suffix;
    # strip the '/fasta' suffix explicitly to recover the locus URI
    locus_uri = url[:-len('/fasta')] if url.endswith('/fasta') else url

    return (locus_uri, res)
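
The loop returns the last response even after three failed tries, so the status code should still be checked. A sketch, assuming the request URL is a locus URI with a '/fasta' suffix:

# locus_uri as obtained from schema_loci above; the suffix is assumed
fasta_url = locus_uri + '/fasta'
locus, res = get_fasta_seqs(fasta_url, headers_get, '2020-01-01T00:00:00')
if res.status_code not in [200, 201]:
    print('Could not download sequences for {0}.'.format(locus))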
Example no. 6
def download_compressed(zip_uri, species_name, schema_name, download_folder,
                        headers_get):
    """ Downloads and extracts a ZIP archive with a ready-to-use
        version of a schema in the Chewie-NS.

        Parameters
        ----------
        zip_uri : str
            Endpoint URL to make the request to download
            the compressed schema.
        species_name : str
            Scientific name of the schema species.
        schema_name : str
            Name of the schema in the Chewie-NS.
        download_folder : str
            Path to the directory to which the ZIP archive
            will be saved.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        schema_path : str
            ZIP archive contents will be extracted to this
            directory.
    """

    zip_name = '{0}{1}_{2}.zip'.format(species_name[0].lower(),
                                       species_name.split(' ')[-1],
                                       schema_name)
    schema_path = os.path.join(download_folder, zip_name.split('.zip')[0])
    fo.create_directory(schema_path)

    # download ZIP archive
    url, zip_response = cr.simple_get_request(
        zip_uri, headers_get, parameters={'request_type': 'download'})
    zip_path = os.path.join(schema_path, zip_name)
    with open(zip_path, 'wb') as archive:
        archive.write(zip_response.content)
    # uncompress
    print('Decompressing schema...')
    shutil.unpack_archive(zip_path, extract_dir=schema_path)
    # delete ZIP
    os.remove(zip_path)

    return schema_path
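
The archive name combines the lowercased first letter of the genus, the species epithet and the schema name: for 'Escherichia coli' and a schema named 'tutorial', the file is 'ecoli_tutorial.zip' and its contents are extracted to a directory with the same stem. A hedged usage sketch:

schema_path = download_compressed(zip_uri, 'Escherichia coli', 'tutorial',
                                  '/tmp/schemas', headers_get)
# schema extracted to /tmp/schemas/ecoli_tutorial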
Example no. 7
def species_schemas_count(base_url, headers_get):
    """ Returns the number of schemas per species in
        the Chewie-NS.

        Parameters
        ----------
        base_url : str
            Base URL of the Chewie-NS.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        info : list of list
            A list with a sublist per species.
            Each sublist contains the species
            identifier, the name of the species
            and the total number of schemas.
    """

    endpoint_list = ['stats', 'species']
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    res = res.json()

    if 'message' in res:
        res = res['message']
    else:
        sys.exit('Could not retrieve species info.')

    if len(res) == 0:
        sys.exit('Could not retrieve species info.')
    else:
        info = []
        for s in res:
            sid = s['species']['value'].split('/')[-1]
            name = s['name']['value']
            num_schemas = s['schemas']['value']
            info.append([sid, name, num_schemas])

    # sort by species identifier
    info = sorted(info, key=lambda x: int(x[0]))

    return info
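
The sorted result is convenient for tabular display, e.g.:

info = species_schemas_count(base_url, headers_get)
for sid, name, num_schemas in info:
    print('{0}\t{1}\t{2}'.format(sid, name, num_schemas))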
Example no. 8
def download_ptf(ptf_hash, download_folder, schema_id, species_id,
                 species_name, headers_get, base_url):
    """ Downloads the Prodigal training file for a schema.

        Parameters
        ----------
        ptf_hash : str
            Unique identifier of the Prodigal training file
            (BLAKE2 hash).
        download_folder : str
            Path to the directory to which the Prodigal
            training file should be saved.
        schema_id : str
            The identifier of the schema in the Chewie-NS.
        species_id : str
            The identifier of the schema's species in the
            Chewie-NS.
        species_name : str
            Scientific name of the schema species.
        headers_get : dict
            HTTP headers for GET requests.
        base_url : str
            Base URL of the Chewie Nomenclature server.

        Returns
        -------
        ptf_file : str
            Path to the Prodigal training file.
    """

    ptf_url, ptf_response = cr.simple_get_request(
        base_url, headers_get,
        ['species', species_id, 'schemas', schema_id, 'ptf'])

    ptf_file = os.path.join(download_folder,
                            '{0}.trn'.format(species_name.replace(' ', '_')))

    with open(ptf_file, 'wb') as ptf:
        ptf.write(ptf_response.content)

    return ptf_file
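
Note that ptf_hash is accepted but not used in the request itself; the endpoint is addressed by the species and schema identifiers alone. A sketch with placeholder identifiers:

ptf_file = download_ptf(ptf_hash, '/tmp/schemas', '1', '8',
                        'Escherichia coli', headers_get, base_url)
# training file saved to /tmp/schemas/Escherichia_coli.trn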
Example no. 9
def main(schema_directory, cpu_cores, nomenclature_server, submit, blast_path,
         update_profiles):

    # get ns configs
    local_date, schema_uri = pv.read_configs(schema_directory, '.ns_config')
    # get schema and species identifiers
    schema_id = schema_uri.split('/')[-1]
    species_id = schema_uri.split('/')[-3]
    if nomenclature_server is None:
        nomenclature_server = schema_uri.split('species/')[0]

    if submit is True and 'tutorial' not in nomenclature_server:
        print('\nOnly authorized registered users may submit new alleles.')
        token = cr.capture_login_credentials(nomenclature_server)
    else:
        token = ''

    # GET request headers
    headers_get = ct.HEADERS_GET_JSON
    headers_get['Authorization'] = token

    # determine current user ID and Role
    if submit is True and 'tutorial' not in nomenclature_server:
        user_id, user_role, user_auth = cr.user_info(nomenclature_server,
                                                     headers_get)
        # verify if user has authorization to submit
        url = cr.make_url(nomenclature_server, 'auth', 'check')
        response = cr.simple_get_request(url, headers_get)[1]
        if response.status_code == 200:
            user_auth = True
        else:
            sys.exit('Current user has no authorization to submit novel '
                     'alleles.\nYou can request authorization to submit '
                     'novel alleles by sending an e-mail to: '
                     '*****@*****.**')
        print('User id: {0}'.format(user_id))
        print('User role: {0}\n'.format(user_role))
    else:
        user_id = ''
        user_role = ''
        user_auth = True if 'tutorial' in nomenclature_server else False

    # POST requests headers
    headers_post = ct.HEADERS_POST_JSON
    headers_post['Authorization'] = token
    headers_post['user_id'] = user_id
    # POST headers to send binary data
    headers_post_bytes = ct.HEADERS_POST
    headers_post_bytes['Authorization'] = token
    headers_post_bytes['user_id'] = user_id

    schema_params = pv.read_configs(schema_directory, '.schema_config')

    # verify that local configs have a single value per parameter
    if all([
            len(schema_params[k]) == 1 for k in schema_params
            if k != 'chewBBACA_version'
    ]) is not True:
        sys.exit('Cannot sync schema with multiple values per parameter.')

    # check if schema exists in the NS
    schema_name, ns_params = cr.get_species_schemas(schema_id, species_id,
                                                    nomenclature_server,
                                                    headers_get)[2:]

    # verify that local configs match NS configs
    # add window size
    if all([
            str(schema_params[k][0]) == ns_params[k]['value']
            for k in schema_params
            if k not in ['chewBBACA_version', 'window_size']
    ]) is not True:
        sys.exit('Local configs do not match Chewie-NS configs.')

    # Get the name of the species from the provided id
    # or vice-versa
    species_id, species_name = cr.species_ids(species_id, nomenclature_server,
                                              headers_get)

    print('Schema id: {0}'.format(schema_id))
    print('Schema name: {0}'.format(schema_name))
    print("Schema's species: {0} (id={1})".format(species_name, species_id))
    print('Last synced: {0}'.format(local_date))

    # get last modification date
    # setting syncing date to last modification date will allow
    # all users to sync even when the schema is locked and being
    # updated by another user
    ns_date = ns_params['last_modified']['value']
    print('\nRemote schema was last modified on: {0}'.format(ns_date))

    # exit if remote schema has not been updated since last
    # sync date and current user does not wish to submit new alleles
    if local_date == ns_date and submit is False:
        sys.exit('\nRemote schema has not been updated since last sync '
                 'process. Local schema is up-to-date.')

    # Create a temporary dir for the new alleles
    temp_dir = os.path.join(os.path.dirname(schema_directory), 'temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # retrieve alleles added to schema after last sync date
    print('\nRetrieving alleles added to remote schema '
          'after {0}...'.format(local_date))
    loci_alleles, server_time, count = retrieve_latest(local_date, schema_uri,
                                                       headers_get, ns_date)

    print('Retrieved {0} alleles for {1} loci.'.format(count,
                                                       len(loci_alleles)))

    # Get schema files from genes list file
    genes_list = os.path.join(schema_directory, '.genes_list')
    genes = fo.pickle_loader(genes_list)

    # update loci structure
    not_in_ns, pickled_loci, \
        updated, not_update, \
        rearranged = update_loci_files(loci_alleles, genes,
                                       schema_directory, temp_dir)

    total_local = sum([len(v[0]) for k, v in not_in_ns.items()])
    print('Local schema has {0} novel alleles for {1} '
          'loci.'.format(total_local, len(not_in_ns)))

    # check if there are any changes to make
    if len(pickled_loci) == 0:
        shutil.rmtree(temp_dir)
        sys.exit('Remote schema has not been altered and local schema '
                 'does not have novel alleles.')

    results = {}
    attributed = 0
    if submit is True and user_auth is True and len(not_in_ns) > 0:

        # attempt to lock schema
        lock_res = cr.simple_post_request(
            nomenclature_server,
            headers_post,
            ['species', species_id, 'schemas', schema_id, 'lock'],
            data=json.dumps({'action': 'lock'}))[1]
        # if schema is already locked user cannot send alleles
        lock_status = lock_res.status_code
        if lock_status == 403:
            print('Schema is already locked. Another user might be updating '
                  'the schema. Please repeat the syncing process after a '
                  'while to add your new alleles to the Chewie-NS.\nThe '
                  'process will now update your local schema with the alleles '
                  'retrieved from the Chewie-NS.')
        else:

            # after locking, check if date matches ns_date
            date_res = cr.simple_get_request(
                nomenclature_server, headers_get,
                ['species', species_id, 'schemas', schema_id, 'modified'])[1]

            date_value = (date_res.json()).split(' ')[-1]

            if date_value != ns_date:
                print('Data retrieved from the Chewie-NS has an older '
                      'timestamp than current schema timestamp. Schema '
                      'might have been updated before this syncing process. '
                      'Please repeat the syncing process in order to add '
                      'your new alleles to the schema. The process will now '
                      'update your local schema with the alleles retrieved '
                      'from the Chewie-NS.')

                # unlock schema
                lock_res = cr.simple_post_request(
                    nomenclature_server,
                    headers_post,
                    ['species', species_id, 'schemas', schema_id, 'lock'],
                    data=json.dumps({'action': 'unlock'}))[1]
            else:
                print(
                    'Collecting data and creating files to submit local alleles...'
                )
                # get list of loci for schema in the NS
                loci_res = cr.simple_get_request(
                    nomenclature_server, headers_get,
                    ['species', species_id, 'schemas', schema_id, 'loci'])[1]
                # get loci files names from response
                for l in loci_res.json()['Loci']:
                    locus_name = l['name']['value'] + '.fasta'
                    locus_uri = l['locus']['value']
                    if locus_name in not_in_ns:
                        not_in_ns[locus_name].append(locus_uri)

                # create files with length values to update
                length_files = create_lengths_files(not_in_ns, temp_dir)

                # create new alleles data
                alleles_files, \
                    loci_ids, \
                    loci_names = create_alleles_files(not_in_ns, nomenclature_server,
                                                      user_id, species_name,
                                                      species_id, schema_id,
                                                      temp_dir)

                # compress files with new alleles
                zipped_files = [
                    '{0}.zip'.format(file) for file in alleles_files
                ]
                list(map(fo.file_zipper, alleles_files, zipped_files))
                alleles_data = list(zip(zipped_files, loci_ids, loci_names))

                print('Sending and inserting new alleles...')
                failed, \
                    start_count = upload_alleles_data(alleles_data, length_files,
                                                      nomenclature_server, headers_post,
                                                      headers_post_bytes, species_id,
                                                      schema_id)

                # track progress through endpoint
                # set time limit for task completion (seconds)
                print()
                time_limit = 2100
                current_time = 0
                status = 'Updating'
                start_count = int(start_count.json()['nr_alleles'])
                while status != 'Complete' and (current_time < time_limit):
                    insertion_status = cr.simple_get_request(
                        nomenclature_server, headers_get, [
                            'species', species_id, 'schemas', schema_id,
                            'loci', 'locus', 'update'
                        ])[1]
                    insertion_status = insertion_status.json()
                    if 'message' in insertion_status:
                        status = 'Complete'
                        results = insertion_status['identifiers']

                    current_count = int(insertion_status['nr_alleles'])

                    inserted = current_count - start_count
                    print('\r',
                          '    Inserted {0} alleles.'.format(inserted),
                          end='')
                    time.sleep(2)
                    current_time += 2

                if current_time != time_limit:
                    # determine alleles that were attributed an identifier
                    repeated = sum([len(r[0]) for l, r in results.items()])
                    attributed = sum([len(r[1]) for l, r in results.items()])

                    print(
                        '\nThe Chewie-NS inserted {0} new alleles and detected '
                        '{1} repeated alleles.'.format(attributed, repeated))
                else:
                    print(
                        '\nCould not retrieve allele identifiers assigned by '
                        'Chewie-NS. Will adapt schema with retrieved alleles. '
                        'Please repeat the syncing process in order to assign '
                        'the new identifiers for the submitted alleles.')

                # remove files in temp folder
                fo.remove_files(length_files)
                fo.remove_files(alleles_files)
                fo.remove_files(zipped_files)

    # change pickled files to FASTA files
    for locus, pick in pickled_loci.items():
        rearranged = pickle_to_fasta(locus, pick, temp_dir, results,
                                     rearranged)

    # change identifiers in SQLite DB
    if len(rearranged) > 0 and update_profiles is True:
        print('\nUpdating local allele identifiers...')
        altered = ps.update_profiles(schema_directory, rearranged)
        if altered is not None:
            print('Updated {0} profiles.\n'.format(altered))
        else:
            print('Could not find local SQLite database to upload profiles.\n')

    # Re-determine the representative sequences
    if attributed > 0 or count > 0:
        PrepExternalSchema.main(temp_dir, schema_directory, cpu_cores,
                                float(schema_params['bsr'][0]),
                                int(schema_params['minimum_locus_length'][0]),
                                11, '', None, blast_path)

        # delete invalid alleles and genes files
        parent_dir = os.path.dirname(schema_directory)
        files = [
            os.path.join(parent_dir, file) for file in os.listdir(parent_dir)
            if 'invalid' in file
        ]

        fo.remove_files(files)

        # get last modification date
        last_modified = cr.simple_get_request(
            nomenclature_server, headers_get,
            ['species', species_id, 'schemas', schema_id, 'modified'])[1]
        last_modified = (last_modified.json()).split(' ')[-1]
        server_time = last_modified

        # update NS config file with latest server time
        ns_configs = os.path.join(schema_directory, '.ns_config')
        fo.pickle_dumper([server_time, schema_uri], ns_configs)

    print('Received {0} new alleles for {1} loci and sent '
          '{2} alleles for {3} loci.'.format(count, len(pickled_loci),
                                             attributed, len(not_in_ns)))

    # delete temp directory
    shutil.rmtree(temp_dir)

    # delete pre-computed BSR values from 'short' directory
    # representatives might have changed and BSR values are outdated
    short_dir = os.path.join(schema_directory, 'short')
    bsr_files = [
        os.path.join(short_dir, f) for f in os.listdir(short_dir)
        if f.endswith('_bsr.txt')
    ]
    fo.remove_files(bsr_files)
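
A hedged invocation sketch; all values are placeholders. Passing None for nomenclature_server makes the function derive the server URL from the schema's .ns_config file, and submit=False syncs without uploading local novel alleles:

main(schema_directory='/path/to/local/schema',
     cpu_cores=4,
     nomenclature_server=None,
     submit=False,
     blast_path='/usr/local/bin',  # assumed path to BLAST executables
     update_profiles=True)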
Example no. 10
def retrieve_alleles(loci_new_alleles, server_time, schema_uri, count,
                     headers_get, ns_date):
    """ Retrieves alleles added to a schema in the Chewie-NS
        during a time interval, up to the maximum number of
        alleles that the server returns at a time (50000).

        Parameters
        ----------
        loci_new_alleles : dict
            A dictionary with loci identifiers as keys and
            dictionaries with alleles identifiers and DNA
            sequences as values.
        server_time : str
            The function will return alleles added to the
            schema after this date (format %Y-%m-%dT%H:%M:%S).
        schema_uri : str
            The URI of the schema in the Chewie-NS.
        count : int
            The cumulative number of sequences returned
            by previous calls.
        headers_get : dict
            HTTP headers for GET requests.
        ns_date : str
            The function will return alleles added to the
            schema up to this date (format %Y-%m-%dT%H:%M:%S).

        Returns
        -------
        A tuple with the following elements:

        loci_new_alleles : dict
            Input `loci_new_alleles` dictionary with alleles
            returned in the current and previous iterations.
        server_time : str
            The date of insertion of the last allele returned by
            the Chewie-NS.
        count : int
            The cumulative number of sequences returned,
            including the current call.
    """

    # request the new alleles starting on the date given
    url = cr.make_url(schema_uri, 'loci')
    payload = {'local_date': server_time, 'ns_date': ns_date}
    # get the new alleles
    response = cr.simple_get_request(url, headers_get, parameters=payload)[1]
    response_content = response.json()

    # get info about sequences that were added since last date
    new_alleles = response_content['newAlleles']

    # get headers info
    response_headers = response.headers
    if len(new_alleles) > 0:
        # get date of last added allele
        server_time = response_headers['Last-Allele']
        # group retrieved alleles by locus
        for allele in new_alleles:
            locus = '{0}{1}'.format(allele['locus_name']['value'], '.fasta')
            allele_id = allele['allele_id']['value']
            sequence = allele['nucSeq']['value']

            loci_new_alleles[locus][allele_id] = sequence

        # keep count of the number of retrieved alleles
        count += len(new_alleles)
    else:
        # get current server date if no alleles were added
        # since last server time
        server_time = response_headers['Server-Date']

    return (loci_new_alleles, server_time, count)
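
Because each response is capped at 50000 alleles, this helper is meant to be called repeatedly until the returned server date reaches ns_date. A hedged pagination sketch: the defaultdict accommodates the nested locus-to-allele assignments made above, the string comparison relies on the ISO-like date format, and the project's actual stopping rule may differ:

from collections import defaultdict

loci_new_alleles = defaultdict(dict)
server_time = local_date  # last sync date, as in main above
count = 0
while server_time < ns_date:
    loci_new_alleles, server_time, count = retrieve_alleles(
        loci_new_alleles, server_time, schema_uri, count,
        headers_get, ns_date)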