def write_mapping_file(mapping_generator, outfile, confirm=True):
    """OUTPUT is the mapping file:
    ------------------------------
    Note: you will not know the source of the mapping unless you pass the
    optional parameter "add_source=True" to the merge_mapping() function.
    col0: Ensembl gene ID
    col2 "add_source" == 1: NCBI gene ID from gene2ensembl
    col2 "add_source" == 2: NCBI gene ID from ncbi_list if symbol == ensembl symbol
        (i.e. for each Ensembl ID, iterate through the NCBI list in the
        gene_info file; when a symbol matches the Ensembl symbol, use this
        NCBI ID, but only if the symbols match exactly once)
    """
    print("step 6 start: write file from mapping generator of tuples")
    mapping_file, mapping_filename = safewfile(outfile, prompt=confirm, default='O')
    count = 0
    for item in mapping_generator:
        count += 1
        mapping_file.write('\t'.join([str(i) for i in item]) + "\n")
    print("total Ensembl IDs uniquely mapped to NCBI gene ID:", count)
    mapping_file.close()
    print("Output file: \"{}\"".format(mapping_filename))
    print("step 6 end\n")
    return count
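
# A minimal usage sketch for write_mapping_file(), with a hypothetical
# generator of (ensembl_gene_id, ncbi_gene_id) tuples and a hypothetical
# output path; safewfile() is assumed to come from this module's imports.
def _demo_write_mapping_file():
    pairs = [('ENSG00000139618', 675),   # BRCA2
             ('ENSG00000141510', 7157)]  # TP53
    return write_mapping_file(iter(pairs), 'ensembl2entrez_demo.mapping',
                              confirm=False)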
def _fetch_data(self, outfile, attributes, filters='', header=None, debug=False):
    cnt_all = 0
    out_f, outfile = safewfile(outfile, prompt=False, default='O')
    if header:
        out_f.write('\t'.join(header) + '\n')
    for species in self.__class__.species_li:
        try:
            dataset = self.get_dataset_name(species)
        except IndexError:
            # bad dataset name, skip (this used to be caught in a try/finally
            # block, so it wasn't dealt with before)
            self.logger.debug("Skip species '%s'" % species)
            continue
        taxid = species[2]
        if not dataset:
            continue
        xml = self._make_query_xml(dataset, attributes=attributes, filters=filters)
        if debug:
            self.logger.info(xml)
        try:
            con = self.query_mart(xml)
        except MartException:
            import traceback
            err_msg = traceback.format_exc()
            self.logger.error("%s %s" % (species[0], err_msg))
            continue
        cnt = 0
        for line in con.split('\n'):
            if line.strip() != '':
                out_f.write(str(taxid) + '\t' + line + '\n')
                cnt += 1
                cnt_all += 1
        self.logger.info("%s %s" % (species[0], cnt))
    out_f.close()
    self.logger.info("Total: %d" % cnt_all)
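
# For context, _make_query_xml() presumably produces a standard BioMart query
# document along these lines (the dataset and attribute names below are
# illustrative, not taken from the code above):
EXAMPLE_MART_QUERY_XML = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1">
  <Dataset name="hsapiens_gene_ensembl" interface="default">
    <Attribute name="ensembl_gene_id" />
    <Attribute name="external_gene_name" />
  </Dataset>
</Query>"""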
def write_mapping_file(mapping_generator, outfile, confirm=True):
    """OUTPUT is the mapping file:
    ------------------------------
    Note: you will not know the source of the mapping unless you pass the
    optional parameter "add_source=True" to the merge_mapping() function.
    col0: Ensembl gene ID
    col2 "add_source" == 1: NCBI gene ID from gene2ensembl
    col2 "add_source" == 2: NCBI gene ID from ncbi_list if mygene.info symbol == ensembl symbol
        (i.e. for each Ensembl ID, iterate through the NCBI list on mygene.info
        (ex: http://mygene.info/v2/gene/100894237?fields=symbol ); when a
        symbol matches the Ensembl symbol, use this NCBI ID, but only if the
        symbols match exactly once)
    """
    print("step 6 start: write file from mapping generator of tuples")
    mapping_file, mapping_filename = safewfile(outfile, prompt=confirm, default='O')
    count = 0
    for item in mapping_generator:
        count += 1
        mapping_file.write('\t'.join([str(i) for i in item]) + "\n")
    print("total Ensembl IDs uniquely mapped to NCBI gene ID:", count)
    mapping_file.close()
    print("Output file: \"{}\"".format(mapping_filename))
    print("step 6 end\n")
    return count
def file_merge(infiles, outfile=None, header=1, verbose=1):
    '''Merge a list of input files that share the same format.
    The first <header> lines are skipped in every file after the
    first one in the list.
    '''
    outfile = outfile or '_merged'.join(os.path.splitext(infiles[0]))
    out_f, outfile = safewfile(outfile)
    if verbose:
        print("Merging...")
    cnt = 0
    for i, fn in enumerate(infiles):
        print(os.path.split(fn)[1], '...', end='')
        line_no = 0
        in_f = anyfile(fn)
        if i > 0:
            # skip header lines in all but the first file
            for k in range(header):
                in_f.readline()
        for line in in_f:
            out_f.write(line)
            line_no += 1
        in_f.close()
        cnt += line_no
        print(line_no)
    out_f.close()
    print("=" * 20)
    print("Done! [total %d lines output]" % cnt)
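
# A minimal usage sketch for file_merge(), with hypothetical file names;
# anyfile() is assumed to open plain or compressed inputs transparently.
def _demo_file_merge():
    file_merge(['gene_info_part1.txt', 'gene_info_part2.txt'],
               outfile='gene_info.txt', header=1)
    # With outfile=None the result would default to 'gene_info_part1_merged.txt'.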
def _fetch_data(self, outfile, attributes, filters='', header=None, debug=False):
    cnt_all = 0
    out_f, outfile = safewfile(outfile, prompt=(not self.no_confirm), default='O')
    if header:
        out_f.write('\t'.join(header) + '\n')
    logging.info('Dumping "%s"...' % os.path.split(outfile)[1])
    for species in self.species_li:
        dataset = self.get_dataset_name(species)
        taxid = species[2]
        if not dataset:
            continue
        xml = self._make_query_xml(dataset, attributes=attributes, filters=filters)
        if debug:
            logging.info(xml)
        try:
            con = self.query_mart(xml)
        except MartException:
            import traceback
            err_msg = traceback.format_exc()
            logging.error("%s %s" % (species[0], err_msg))
            continue
        cnt = 0
        for line in con.split('\n'):
            if line.strip() != '':
                out_f.write(str(taxid) + '\t' + line + '\n')
                cnt += 1
                cnt_all += 1
        logging.info("%s %s" % (species[0], cnt))
    out_f.close()
    logging.info("Total: %d" % cnt_all)
def dispatch(src):
    src_doc = src_dump.find_one({'_id': src})
    datadump_logfile = src_doc.get('logfile', '')
    if datadump_logfile:
        upload_logfile = os.path.join(os.path.split(datadump_logfile)[0],
                                      '{}_upload.log'.format(src))
    else:
        from config import DATA_ARCHIVE_ROOT
        upload_logfile = os.path.join(DATA_ARCHIVE_ROOT,
                                      '{}_upload.log'.format(src))
    log_f, logfile = safewfile(upload_logfile, prompt=False, default='O')
    p = Popen(['python', '-u', '-m', 'dataload.start', src],
              stdout=log_f, stderr=STDOUT, cwd=src_path)
    p.logfile = logfile
    p.log_f = log_f
    return p
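
# Usage sketch for dispatch(): it launches "python -m dataload.start <src>" as
# a subprocess with its output redirected to a per-source upload log. The
# source name is hypothetical; src_dump, src_path, Popen and STDOUT are assumed
# to be module-level (e.g. from subprocess import Popen, STDOUT).
def _demo_dispatch():
    p = dispatch('entrez_gene')
    p.wait()         # block until the upload process exits
    p.log_f.close()  # close the log handle attached by dispatch()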
def _fetch_data(self, outfile, attributes, filters='', header=None):
    cnt_lines_all = 0
    cnt_species_success = 0
    out_f, outfile = safewfile(outfile, prompt=False, default='O')
    if header:
        out_f.write('\t'.join(header) + '\n')
    for count, species in enumerate(self.species_li):
        try:
            dataset = self.get_dataset_name(species)
        except IndexError:
            self.logger.debug("Skip species '%s'", species)
            continue
        if not dataset:
            continue
        taxid = species[2]
        xml = self._make_query_xml(dataset, attributes=attributes, filters=filters)
        try:
            con = self.query_mart(xml)
        except EntrezgeneNotFound as err:
            if 'xref_entrezgene' in outfile:
                cnt_species_success += 1
                self.logger.warning("%s:: %s: %s", os.path.basename(outfile),
                                    species[0],
                                    'Skipping species without entrez gene id')
            else:
                self.logger.error("%s:: %s %s", os.path.basename(outfile),
                                  species[0], err)
            continue
        except GeneNameNotFound:
            # retry the query without the 'external_gene_name' attribute, then
            # re-insert an empty column at its original position in the output
            _attributes = attributes.copy()
            _attr_ext_gene_index = attributes.index('external_gene_name')
            _attributes.remove('external_gene_name')
            self.logger.debug(_attributes)
            _xml = self._make_query_xml(dataset, attributes=_attributes,
                                        filters=filters)
            try:
                con = self.query_mart(_xml)
            except MartException as err:
                self.logger.error("%s:: %s %s", os.path.basename(outfile),
                                  species[0], err)
                continue
            self.logger.warning("%s:: %s: %s", os.path.basename(outfile),
                                species[0],
                                'Retried to request species without external gene name')
            cnt_lines = 0
            cnt_species_success += 1
            for line in con.split('\n'):
                if line.strip() != '':
                    tsv = line.split('\t')
                    tsv.insert(_attr_ext_gene_index, '')
                    out_f.write(str(taxid) + '\t' + '\t'.join(tsv) + '\n')
                    cnt_lines += 1
                    cnt_lines_all += 1
            self.logger.info("%s:: %d/%d %s %d records", os.path.basename(outfile),
                             count + 1, len(self.species_li), species[0], cnt_lines)
            continue
        except MartException as err:
            self.logger.error("%s:: %s %s", os.path.basename(outfile),
                              species[0], err)
            continue
        cnt_lines = 0
        cnt_species_success += 1
        if not con:
            self.logger.error('Empty Response.')
        for line in con.split('\n'):
            if line.strip() != '':
                out_f.write(str(taxid) + '\t' + line + '\n')
                cnt_lines += 1
                cnt_lines_all += 1
        self.logger.info("%s:: %d/%d %s %d records", os.path.basename(outfile),
                         count + 1, len(self.species_li), species[0], cnt_lines)
    out_f.close()
    self.logger.info("Total: %s:: %d/%d successes %d records",
                     os.path.basename(outfile), cnt_species_success,
                     len(self.species_li), cnt_lines_all)
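
# Usage sketch (hypothetical): inside a BioMart dumper subclass, assuming
# 'external_gene_name' is the 2nd requested attribute, matching the empty
# column re-inserted above; the file name and attribute names are assumptions.
def _demo_fetch_xref(dumper):
    # 'dumper' is an instance of the class that defines _fetch_data()
    dumper._fetch_data(
        'gene_ensembl__xref_entrezgene__dm.txt',
        attributes=['ensembl_gene_id', 'external_gene_name', 'dbprimary_acc'],
        header=['taxid', 'ensembl_gene_id', 'symbol', 'entrezgene'])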