def load_data(data_folder): input_fn = os.path.join(data_folder,"biomuta-master.csv") open_file = open(input_fn) db_biomuta = csv.reader(open_file) index = next(db_biomuta) assert len(index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index)) index = [clean_index(s) for s in index] biomuta = (dict(zip(index, row)) for row in db_biomuta) json_rows = map(_map_line_to_json, biomuta) fd_tmp, tmp_path = mkstemp(dir=data_folder) try: with open(tmp_path, "w") as f: dbwriter = csv.writer(f) for i, doc in enumerate(json_rows): if doc: dbwriter.writerow([doc['_id'], json.dumps(doc)]) csvsort(tmp_path, [0,], has_header=False) with open(tmp_path) as csvfile: json_rows = csv.reader(csvfile) json_rows = (json.loads(row[1]) for row in json_rows) row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"])) json_rows = (merge_duplicate_rows(rg, "biomuta") for rg in row_groups) json_rows = (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows) for res in json_rows: yield res finally: os.remove(tmp_path)
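# merge_duplicate_rows is imported from the shared utils and used by every
# loader in this file. Its implementation isn't shown here; the following is a
# minimal sketch of the behavior its call sites imply (collapse a group of docs
# sharing one _id by folding the per-source subdocument values into lists).
# The name merge_duplicate_rows_sketch and the exact merge rules are
# assumptions, not the actual utils code.
def merge_duplicate_rows_sketch(rows, source):
    rows = iter(rows)
    merged = next(rows)  # start from the first doc in the group
    for row in rows:
        for key, value in row[source].items():
            existing = merged[source].get(key)
            if existing is None:
                merged[source][key] = value
            elif isinstance(existing, list):
                existing.append(value)
            else:
                merged[source][key] = [existing, value]
    return merged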
def data_generator(input_file, version):
    open_file = open(input_file)
    evs = csv.reader(open_file, delimiter=" ")
    # skip the first 8 meta lines
    evs = islice(evs, 8, None)
    # check the column count first so short rows can't raise IndexError on row[30]
    evs = (row for row in evs if len(row) == VALID_COLUMN_NO and ":" in row[30])
    # skip rows with multiple mutations
    evs = (row for row in evs if len(row[3].split(";")) == 1)
    json_rows = map(partial(_map_line_to_json, version=version), evs)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "evs") for rg in row_groups)
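# Note: unlike most loaders here, this generator does not sort before grouping,
# so it relies on the EVS file already listing each variant's lines adjacently.
# itertools.groupby only merges *adjacent* equal keys:
#
#   >>> [k for k, _ in groupby([1, 1, 2, 1])]
#   [1, 2, 1]   # the trailing 1 is not merged into the first group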
def data_generator(input_file):
    # sort by the first column (hgvs id returned from Mutalyzer)
    # TODO: use some python there... (see the sketch below)
    os.system("sort -k1 -n %s > %s.sorted" % (input_file, input_file))
    open_file = open("%s.sorted" % input_file)
    emv = csv.reader(open_file, delimiter=",")
    # skip header
    next(emv)
    # drop rows with an empty hgvs id
    emv = filter(lambda x: x[0], emv)
    json_rows = map(_map_line_to_json, emv)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "emv") for rg in row_groups)
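# The TODO above asks for the shell sort to be replaced with Python. A minimal
# sketch, assuming the file fits in memory; groupby only needs equal ids to be
# adjacent, so a plain lexicographic sort on the id column is sufficient. This
# helper (sort_emv_by_id) is illustrative, not the code the loader runs today.
def sort_emv_by_id(input_file):
    with open(input_file) as f:
        reader = csv.reader(f)
        header = next(reader)
        rows = sorted(reader, key=lambda r: r[0])
    with open("%s.sorted" % input_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)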
def load_data(input_file):
    # sort on chromosome/position/hgvs columns so duplicate ids are adjacent
    os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv"
              % (input_file, input_file))
    open_file = open("%s_sorted.tsv" % input_file)
    print(input_file)
    clinvar = csv.reader(open_file, delimiter="\t")
    # keep GRCh37 rows with a usable hgvs description: no '-', no '?',
    # and no protein-level ("p.") notation
    clinvar = (row for row in clinvar
               if row[18] != '-' and '?' not in row[18]
               and row[13] != "" and row[12] == "GRCh37"
               and not re.search(r'p\.', row[18]))
    json_rows = (row for row in map(_map_line_to_json, clinvar) if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)
def load_data(data_folder):
    # extract the hg19 vcf from the public tarball into data_folder
    tar = tarfile.open(
        os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.tar"))
    member = tar.getmember(
        "Kaviar-160204-Public/vcfs/Kaviar-160204-Public-hg19.vcf.gz")
    member.name = os.path.basename(member.name)
    tar.extract(member, path=data_folder)
    tar.close()
    input_fn = os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.gz")
    vcf_reader = vcf.Reader(filename=input_fn, compressed=True,
                            strict_whitespace=True)
    # one vcf record can map to several docs (one per alt), so flatten
    json_rows = map(_map_line_to_json, vcf_reader)
    json_rows = chain.from_iterable(json_rows)
    fd_tmp, tmp_path = mkstemp(dir=data_folder)
    try:
        with open(tmp_path, "w") as f:
            dbwriter = csv.writer(f)
            for doc in json_rows:
                if doc:
                    dbwriter.writerow([doc['_id'], json.dumps(doc)])
        # the temp file has no header row; csvsort defaults to has_header=True
        csvsort(tmp_path, [0], has_header=False)
        with open(tmp_path) as csvfile:
            json_rows = csv.reader(csvfile)
            json_rows = (json.loads(row[1]) for row in json_rows)
            row_groups = (it for (key, it) in
                          groupby(json_rows, lambda row: row["_id"]))
            json_rows = (merge_duplicate_rows(rg, "kaviar") for rg in row_groups)
            import logging  # better placed at module level
            for row in json_rows:
                logging.debug(row)
                yield unlist(dict_sweep(row, vals=[None]))
    finally:
        os.remove(tmp_path)
        os.remove(input_fn)
def fetch_generator(tabix, contig):
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    ids = db.keys()
    set_ids = set(ids)
    print(len(ids))
    fetch = tabix.fetch(contig)
    rows = map(lambda x: x.split('\t'), fetch)
    # keep coding/non-coding transcript annotations ("CodingTranscript" also
    # matches "NonCodingTranscript" as a substring), plus any row whose hgvs id
    # is in the precomputed id set
    annos = (row for row in rows
             if "CodingTranscript" in row[9]
             or get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = map(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
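# get_hgvs_from_vcf comes from the shared hgvs utils. For orientation only, a
# sketch of the simplest (SNV) case — genomic hgvs ids look like
# "chr1:g.35366C>T". Indels need extra normalization and are omitted; this is
# an assumption about the helper's output format, not its real implementation.
def get_hgvs_from_vcf_sketch(chrom, pos, ref, alt):
    if len(ref) == 1 and len(alt) == 1:  # single-nucleotide variant
        return "chr%s:g.%s%s>%s" % (chrom, pos, ref, alt)
    raise NotImplementedError("indels are not covered by this sketch")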
def load_data(input_file):
    src_db = mongo.get_src_db()
    if "dbsnp_hg19" not in src_db.collection_names():
        raise ValueError(
            "'dbsnp_hg19' collection is missing, run dbsnp uploader first")
    dbsnp_col = src_db["dbsnp_hg19"]
    open_file = open(input_file, encoding="cp1252")
    open_file = csv.reader(open_file, delimiter="\t")
    next(open_file)
    grasp = map(row_generator, open_file)
    grasp = filter(lambda row: row[58] != "", grasp)
    json_rows = map(partial(_map_line_to_json, dbsnp_col=dbsnp_col), grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    for row in (merge_duplicate_rows(rg, "grasp") for rg in row_groups):
        yield row
def load_data(data_folder):
    input_fn = os.path.join(data_folder, "CCLE_DepMap_18q3_maf_20180718.txt")
    db_ccle = csv.reader(open(input_fn), delimiter='\t')
    index = next(db_ccle)
    assert len(index) == VALID_COLUMN_NO, \
        "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
    index = [clean_index(s) for s in index]
    ccle = (dict(zip(index, row)) for row in db_ccle)
    ccle = filter(lambda row: row["chromosome"] != "", ccle)
    json_rows = map(_map_line_to_json, ccle)
    json_rows = (row for row in json_rows if row)
    # sort in memory so groupby sees duplicate _ids adjacently
    json_rows = sorted(json_rows, key=lambda k: k['_id'])
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    json_rows = (merge_duplicate_rows(rg, "ccle") for rg in row_groups)
    return (unlist(dict_sweep(row, vals=[None])) for row in json_rows)
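# dict_sweep and unlist also come from the shared utils. A hedged sketch of
# what the call sites above rely on: dict_sweep(d, vals) recursively drops keys
# whose value is in vals (here [None]), and unlist collapses single-element
# lists to scalars. Names with the _sketch suffix are illustrative assumptions.
def dict_sweep_sketch(d, vals):
    return {k: dict_sweep_sketch(v, vals) if isinstance(v, dict) else v
            for k, v in d.items() if v not in vals}

def unlist_sketch(d):
    for k, v in d.items():
        if isinstance(v, list) and len(v) == 1:
            d[k] = v[0]  # [x] -> x
        elif isinstance(v, dict):
            unlist_sketch(v)
    return d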
def load_data(input_file):
    # os.system("sort -t$'\t' -k18 -k14 %s > %s_sorted.tsv" % (input_file, input_file))
    # open_file = open("%s_sorted.tsv" % (input_file))
    open_file = open(input_file)
    open_file = csv.reader(open_file, delimiter="\t")
    cosmic = []
    for row in open_file:
        try:
            c = row[13].split(".")[1]
        except IndexError:  # no "." in the hgvs column
            c = ""
        row.append(row[17].split("-")[0] + "." + c)
        cosmic.append(row)
        if row[-1] != "":
            print(row[-1])
    cosmic = sorted(cosmic, key=operator.itemgetter(17), reverse=True)
    cosmic = filter(lambda row: row[17] != "" and row[13] != "", cosmic)
    json_rows = map(_map_line_to_json, cosmic)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cosmic") for rg in row_groups)
def load_data(data_folder):
    input_fn = os.path.join(data_folder,
                            "denovo-db.non-ssc-samples.variants.tsv")
    open_file = open(input_fn)
    db_denovodb = csv.reader(open_file, delimiter="\t")
    index = next(db_denovodb)
    # skip the "##"-prefixed meta lines preceding the real header
    while index[0].startswith("##"):
        index = next(db_denovodb)
    assert len(index) == VALID_COLUMN_NO, \
        "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
    index = [clean_index(s) for s in index]
    denovodb = (dict(zip(index, row)) for row in db_denovodb)
    denovodb = filter(lambda row: row["Chr"] != "", denovodb)
    json_rows = map(_map_line_to_json, denovodb)
    json_rows = (row for row in json_rows if row)
    json_rows = sorted(json_rows, key=lambda row: row["_id"])
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    json_rows = (merge_duplicate_rows(rg, "denovodb") for rg in row_groups)
    return (unlist(dict_sweep(row, vals=[None])) for row in json_rows)