def __init__(self):
     """Build the BioSamples submitter from the 'biosamples' configuration section."""
     # The settings are read in the positional order passed to HALCommunicator.
     connection_settings = [
         cfg.query('biosamples', setting)
         for setting in ('aap_url', 'bsd_url', 'username', 'password')
     ]
     communicator = HALCommunicator(*connection_settings)
     domain = cfg.query('biosamples', 'domain')
     self.submitter = BSDSubmitter(communicator, domain)
Exemplo n.º 2
0
def _extract_flank_sequence(samtools, genome_assembly_fasta, region, description):
    """Return the upper-cased bases of ``region`` as a single contiguous string.

    :param samtools: samtools executable to invoke
    :param genome_assembly_fasta: path to the FASTA file passed to ``samtools faidx``
    :param region: region string in ``contig:start-end`` form
    :param description: text handed to ``run_command_with_output`` to describe the command
    """
    command = f"{samtools} faidx {genome_assembly_fasta} {region} | grep -v '^>' | sed 's/\\n//' "
    raw_sequence = run_command_with_output(description, command, return_process_output=True)
    # sed works one line at a time, so the substitution in the pipeline never
    # sees a newline; remove every whitespace character here so a sequence
    # wrapped over several FASTA lines still comes back as one string.
    return ''.join(raw_sequence.split()).upper()


def check_submitted_variant_flanks(mongo_client, ssid):
    """Compare the flanking sequences of every pair of variants sharing an accession.

    Looks up the documents of ``dbsnpSubmittedVariantEntity`` whose accession is
    ``ssid`` and that carry no ``remappedFrom`` field, extracts 50 bases on each
    side of each variant start with ``samtools faidx``, then prints the result of
    ``format_output`` for each pair of variants.

    :param mongo_client: client giving access to the ``eva_accession_sharded`` database
    :param ssid: accession to look up (converted with ``int``)
    """
    samtools = cfg.query('executable', 'samtools', ret_default='samtools')
    sve_collection = mongo_client['eva_accession_sharded']['dbsnpSubmittedVariantEntity']
    cursor = sve_collection.find({'accession': int(ssid), 'remappedFrom': {'$exists': False}})
    flank_size = 50
    # Materialise the results before running the external commands.
    variant_records = list(cursor)
    id_2_info = {}
    for variant_rec in variant_records:
        contig = variant_rec['contig']
        start = variant_rec['start']
        # NOTE(review): start - flank_size is not clamped, so a variant closer
        # than flank_size to the contig start yields a region starting below 1.
        flank_up_coord = f"{contig}:{start - flank_size}-{start - 1}"
        flank_down_coord = f"{contig}:{start + 1}-{start + flank_size}"
        genome_assembly_fasta = get_genome(assembly_accession=variant_rec['seq'], taxonomy=variant_rec['tax'])
        flank_up = _extract_flank_sequence(
            samtools, genome_assembly_fasta, flank_up_coord,
            f'Extract upstream sequence using {flank_up_coord}'
        )
        flank_down = _extract_flank_sequence(
            samtools, genome_assembly_fasta, flank_down_coord,
            f'Extract downstream sequence using {flank_down_coord}'
        )
        id_2_info[variant_rec['_id']] = {'variant_rec': variant_rec, 'flank_up': flank_up, 'flank_down': flank_down}

    # combinations() is consumed lazily; there is no need to build the full list first.
    for variant_id1, variant_id2 in itertools.combinations(id_2_info, 2):
        info1 = id_2_info[variant_id1]
        info2 = id_2_info[variant_id2]
        alignment, strand = compare_variant_flanks(
            info1['flank_up'] + info1['variant_rec']['ref'] + info1['flank_down'],
            info2['flank_up'] + info2['variant_rec']['ref'] + info2['flank_down']
        )
        output = format_output(
            ssid, info1['variant_rec'], info2['variant_rec'], alignment, strand,
            info1['flank_up'], info1['flank_down'],
            info2['flank_up'], info2['flank_down']
        )
        print(output)
Exemplo n.º 3
0
 def upload_vcf_files_to_ena_ftp(self, files_to_upload):
     """Upload files to the ENA FTP server, inside a directory named after ``self.eload``.

     The directory is created when it is not already listed on the server.
     Each file is stored under its base name.

     :param files_to_upload: iterable of local file paths to upload
     """
     host = cfg.query('ena', 'ftphost')
     self.info('Connect to %s', host)
     # NOTE(review): assumes HackFTP_TLS derives from ftplib.FTP_TLS and so
     # supports the context-manager protocol - confirm. The with-block makes
     # sure the connection is closed even when an upload fails.
     with HackFTP_TLS() as ftps:
         ftps.connect(host,
                      port=int(cfg.query('ena', 'ftpport', ret_default=21)))
         ftps.login(cfg.query('ena', 'username'), cfg.query('ena', 'password'))
         ftps.prot_p()
         if self.eload not in ftps.nlst():
             self.info('Create %s directory', self.eload)
             ftps.mkd(self.eload)
         ftps.cwd(self.eload)
         for file_to_upload in files_to_upload:
             file_name = os.path.basename(file_to_upload)
             self.info('Upload %s to FTP', file_name)
             with open(file_to_upload, 'rb') as open_file:
                 ftps.storbinary('STOR %s' % file_name, open_file)
Exemplo n.º 4
0
 def upload_xml_files_to_ena(self, submission_file, project_file,
                             analysis_file):
     """Post the submission, project and analysis XML files to the ENA submit URL.

     The response text is stored in ``self.results['receipt']`` and the output of
     ``parse_ena_receipt`` is merged into ``self.results``; any errors found
     there are logged with ``self.error``.
     """
     submit_url = cfg.query('ena', 'submit_url')
     credentials = HTTPBasicAuth(cfg.query('ena', 'username'),
                                 cfg.query('ena', 'password'))
     xml_parts = {}
     for form_field, xml_path in (('SUBMISSION', submission_file),
                                  ('PROJECT', project_file),
                                  ('ANALYSIS', analysis_file)):
         xml_parts[form_field] = (os.path.basename(xml_path),
                                  get_file_content(xml_path),
                                  'application/xml')
     response = requests.post(submit_url, auth=credentials, files=xml_parts)

     receipt_text = response.text
     self.results['receipt'] = receipt_text
     self.results.update(self.parse_ena_receipt(receipt_text))
     reported_errors = self.results['errors']
     if reported_errors:
         self.error('\n'.join(reported_errors))
0
def get_hold_date_from_ena(project_accession, project_alias=None):
    """Return the hold date of a project as reported by ENA.

    A RECEIPT request is posted to the ENA submit URL. When the receipt has no
    PROJECT element or no ``holdUntilDate`` attribute, the project XML is
    downloaded and the ``ENA-FIRST-PUBLIC`` attribute is used instead.

    :param project_accession: accession of the project
    :param project_alias: alias of the project; resolved with
        ``get_project_alias`` when not provided
    :raises ValueError: if neither source provides a date
    """
    project_alias = project_alias or get_project_alias(project_accession)

    xml_request = f'''<SUBMISSION_SET>
           <SUBMISSION>
               <ACTIONS>
                   <ACTION>
                       <RECEIPT target="{project_alias}"/>
                  </ACTION>
              </ACTIONS>
           </SUBMISSION>
       </SUBMISSION_SET>'''
    submit_url = cfg.query('ena', 'submit_url')
    credentials = HTTPBasicAuth(cfg.query('ena', 'username'),
                                cfg.query('ena', 'password'))
    response = requests.post(submit_url, auth=credentials,
                             files={'SUBMISSION': xml_request})
    receipt = ET.fromstring(response.text)
    try:
        raw_hold_date = receipt.findall('PROJECT')[0].attrib['holdUntilDate']
        # Drop the colon of the UTC offset before parsing it with %z.
        return datetime.strptime(raw_hold_date.replace(':', ''), '%Y-%m-%d%z')
    except (IndexError, KeyError):
        # Without a hold date the project is assumed to be public already,
        # so look for its first-public date instead.
        xml_root = download_xml_from_ena(
            f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}')
        project_attributes = xml_root.xpath(
            '/PROJECT_SET/PROJECT/PROJECT_ATTRIBUTES/PROJECT_ATTRIBUTE')
        for project_attribute in project_attributes:
            if project_attribute.findall('TAG')[0].text == 'ENA-FIRST-PUBLIC':
                first_public = project_attribute.findall('VALUE')[0].text
                return datetime.strptime(first_public, '%Y-%m-%d')
        raise ValueError(
            f"Couldn't get hold date from ENA for {project_accession} ({project_alias})"
        )
Exemplo n.º 6
0
def get_genome_fasta_and_report(species_name,
                                assembly_accession,
                                output_directory=None,
                                overwrite=False):
    """Return the FASTA and assembly report paths of an assembly.

    The assembly is downloaded or constructed when either file is missing or
    when ``overwrite`` is set.

    :param species_name: species name passed to ``NCBIAssembly``
    :param assembly_accession: accession of the assembly
    :param output_directory: destination directory; defaults to the
        'genome_downloader' / 'output_directory' configuration value
    :param overwrite: regenerate the files even when both already exist
    :return: tuple ``(assembly_fasta_path, assembly_report_path)``
    """
    if not output_directory:
        output_directory = cfg.query('genome_downloader', 'output_directory')
    assembly = NCBIAssembly(assembly_accession, species_name, output_directory,
                            eutils_api_key=cfg['eutils_api_key'])
    both_files_present = (os.path.isfile(assembly.assembly_fasta_path)
                          and os.path.isfile(assembly.assembly_report_path))
    if not both_files_present or overwrite:
        assembly.download_or_construct(overwrite=overwrite)
    return assembly.assembly_fasta_path, assembly.assembly_report_path
Exemplo n.º 7
0
    def upload_xml_files_to_ena(self, submission_file, project_file,
                                analysis_file):
        """Post the submission and analysis XML files to the ENA submit URL.

        The project XML is only attached when ``project_file`` is set. The
        response text is stored in ``self.results['receipt']`` and the output
        of ``parse_ena_receipt`` is merged into ``self.results``; any errors
        found there are logged with ``self.error``.
        """
        def as_xml_part(xml_path):
            # (file name, content, content type) tuple used for each multipart field.
            return (os.path.basename(xml_path), get_file_content(xml_path),
                    'application/xml')

        file_dict = {
            form_field: as_xml_part(xml_path)
            for form_field, xml_path in (('SUBMISSION', submission_file),
                                         ('ANALYSIS', analysis_file))
        }
        # No project file is given when uploading to an existing project.
        if project_file:
            file_dict['PROJECT'] = as_xml_part(project_file)

        submit_url = cfg.query('ena', 'submit_url')
        credentials = HTTPBasicAuth(cfg.query('ena', 'username'),
                                    cfg.query('ena', 'password'))
        response = requests.post(submit_url, auth=credentials, files=file_dict)

        receipt_text = response.text
        self.results['receipt'] = receipt_text
        self.results.update(self.parse_ena_receipt(receipt_text))
        reported_errors = self.results['errors']
        if reported_errors:
            self.error('\n'.join(reported_errors))
Exemplo n.º 8
0
def get_reference_fasta_and_report(species_name, reference_accession, output_directory=None, overwrite=False):
    """Return the FASTA path and assembly report path of a reference.

    The accession is first tested against the assembly accession format, then
    against the GenBank sequence accession format. Files are downloaded when
    missing or when ``overwrite`` is set.

    :param species_name: species name passed to ``NCBIAssembly`` / ``NCBISequence``
    :param reference_accession: assembly or sequence accession
    :param output_directory: destination directory; defaults to the
        'genome_downloader' / 'output_directory' configuration value
    :param overwrite: download again even when the files already exist
    :return: tuple ``(fasta_path, report_path)``. ``report_path`` is ``None``
        for a sequence accession, and both values are ``None`` when the
        accession matches neither format.
    """
    output_directory = output_directory or cfg.query('genome_downloader', 'output_directory')
    if NCBIAssembly.is_assembly_accession_format(reference_accession):
        assembly = NCBIAssembly(
            reference_accession, species_name, output_directory,
            eutils_api_key=cfg['eutils_api_key']
        )
        if not os.path.isfile(assembly.assembly_fasta_path) or not os.path.isfile(assembly.assembly_report_path) or overwrite:
            assembly.download_or_construct(overwrite=overwrite)
        return assembly.assembly_fasta_path, assembly.assembly_report_path

    if NCBISequence.is_genbank_accession_format(reference_accession):
        reference = NCBISequence(reference_accession, species_name, output_directory,
                                 eutils_api_key=cfg['eutils_api_key'])
        if not os.path.isfile(reference.sequence_fasta_path) or overwrite:
            reference.download_contig_sequence_from_ncbi(genbank_only=True)
        return reference.sequence_fasta_path, None

    logger.warning(f'{reference_accession} is not recognize as either an INSDC assembly or sequence.')
    # Keep the two-element shape of the other branches so that callers
    # unpacking the result do not fail on an unrecognised accession.
    return None, None
Exemplo n.º 9
0
def get_genome(taxonomy, assembly_accession):
    """Build the path of an assembly's FASTA file under the genome downloader directory.

    The layout is ``<output_directory>/<scientific_name>/<accession>/<accession>.fa``,
    where the scientific name is lower-cased and its spaces become underscores.
    """
    genome_root = cfg.query('genome_downloader', 'output_directory')
    scientific_name = get_scientific_name_from_taxonomy(taxonomy)
    species_directory = scientific_name.lower().replace(' ', '_')
    fasta_name = assembly_accession + '.fa'
    return os.path.join(genome_root, species_directory, assembly_accession, fasta_name)