from itertools import groupby, imap


def fetch_generator(tabix, contig):
    fetch = tabix.fetch(contig)
    rows = imap(lambda x: x.split('\t'), fetch)
    # keep rows whose annotation type (column 10) marks a coding transcript
    annos = (row for row in rows if "CodingTranscript" in row[9])
    json_rows = imap(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
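# Usage sketch: fetch_generator expects a tabix-indexed annotation file opened
# with a library such as pysam. The file name below is an assumption for
# illustration, not part of the original code.
import pysam

tabix = pysam.Tabixfile('whole_genome_SNVs_inclAnno.tsv.gz')
for doc in fetch_generator(tabix, '1'):
    pass  # one merged JSON document per "_id"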
def load_data(input_file): open_file = open("%s.tsv" % input_file) open_file = csv.reader(open_file, delimiter="\t") open_file.next() grasp = imap(row_generator, open_file) grasp = ifilter(lambda row: row[58] != "", grasp) json_rows = imap(_map_line_to_json, grasp) json_rows = (row for row in json_rows if row) row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"])) return (merge_duplicate_rows(rg, "grasp") for rg in row_groups)
import csv
from itertools import groupby, ifilter


def load_data(input_file):
    open_file = open('%s.tsv' % input_file)
    open_file = csv.reader(open_file, delimiter="\t")
    open_file.next()  # skip the header row
    grasp = map(row_generator, open_file)  # eager: reads the whole file into memory
    grasp = ifilter(lambda row: row[58] != "", grasp)
    json_rows = map(_map_line_to_json, grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "grasp") for rg in row_groups)
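# Usage sketch: stream the merged GRASP documents to a JSON-lines file. The
# input prefix and output path below are assumptions for illustration.
import json

with open('grasp_docs.json', 'w') as out:
    for doc in load_data('GRASP2final'):
        out.write(json.dumps(doc) + '\n')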
def load_data(input_file): """ write_file output and csv.reader input_file '/opt/myvariant.info/load_archive/drugbank/drugbank.csv' """ open_file = open(input_file) drugbank = csv.reader(open_file, delimiter=',') drugbank.next() json_rows = imap(_map_line_to_json, drugbank) row_groups = (it for (key, it) in groupby(json_rows, lambda row: row['_id'])) return (merge_duplicate_rows(rg, 'drugbank') for rg in row_groups)
import csv
import os
from itertools import groupby, ifilter, imap


def data_generator(input_file):
    # sort by the first column (hgvs id returned from Mutalyzer) so that
    # duplicate ids end up adjacent for groupby
    os.system("sort -k1 -n %s > %s.sorted" % (input_file, input_file))
    open_file = open("%s.sorted" % (input_file))
    emv = csv.reader(open_file, delimiter=",")
    emv.next()  # skip the header row
    emv = ifilter(lambda x: x[0], emv)
    json_rows = imap(_map_line_to_json, emv)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "emv") for rg in row_groups)
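# An alternative to shelling out for the sort: do it in-process. A sketch,
# assuming the file fits in memory; it also keeps the header row out of the
# sorted body, which the shell sort above does not guarantee.
import csv

def sorted_rows(input_file):
    with open(input_file) as f:
        reader = csv.reader(f)
        reader.next()  # drop the header row before sorting
        return sorted(reader, key=lambda row: row[0])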
import csv
from itertools import groupby, imap, islice


def data_generator(input_file):
    open_file = open(input_file)
    evs = csv.reader(open_file, delimiter=" ")
    # skip the first 8 meta lines
    evs = islice(evs, 8, None)
    evs = (row for row in evs if ":" in row[30] and len(row) == VALID_COLUMN_NO)
    # skip rows with multiple mutations
    evs = (row for row in evs if len(row[3].split(";")) == 1)
    json_rows = imap(_map_line_to_json, evs)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "evs") for rg in row_groups)
import csv
from functools import partial
from itertools import groupby, imap, islice


def data_generator(input_file, version):
    open_file = open(input_file)
    evs = csv.reader(open_file, delimiter=" ")
    # skip the first 8 meta lines
    evs = islice(evs, 8, None)
    evs = (row for row in evs if ":" in row[30] and len(row) == VALID_COLUMN_NO)
    # skip rows with multiple mutations
    evs = (row for row in evs if len(row[3].split(";")) == 1)
    # bind version so every row is mapped with the same dataset version
    json_rows = imap(partial(_map_line_to_json, version=version), evs)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "evs") for rg in row_groups)
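# An equivalent spelling without functools.partial:
#     json_rows = (_map_line_to_json(row, version=version) for row in evs)
#
# Usage sketch: the version argument is threaded through to every row's
# mapper. The file name and version value below are assumptions.
for doc in data_generator('evs_chr1.txt', version='hg19'):
    pass  # one merged JSON document per "_id"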
import csv
import os
import re
from itertools import groupby, imap


def load_data(input_file):
    # sort on columns 14, 15 and 20 so rows sharing an id end up adjacent;
    # note that $'\t' quoting requires a bash-compatible /bin/sh
    os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv"
              % (input_file, input_file))
    open_file = open("%s_sorted.tsv" % (input_file))
    print input_file
    clinvar = csv.reader(open_file, delimiter="\t")
    # keep GRCh37 rows with a usable HGVS in column 19: no '-', no '?',
    # and no protein-level "p." notation
    clinvar = (row for row in clinvar
               if row[18] != '-' and row[18].find('?') == -1
               and row[13] != "" and row[12] == "GRCh37"
               and not re.search(r'p\.', row[18]))
    json_rows = (row for row in imap(_map_line_to_json, clinvar) if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)
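# A more portable spelling of the sort that avoids the bash-only $'\t'
# quoting; same sort keys and output name assumed as above.
import subprocess

def sort_clinvar(input_file):
    with open("%s_sorted.tsv" % input_file, "w") as out:
        subprocess.check_call(
            ["sort", "-t", "\t", "-k14", "-k15", "-k20", "-n", input_file],
            stdout=out)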
import dbm
from itertools import groupby, imap


def fetch_generator(tabix, contig):
    # load the pre-built set of hgvs ids for this contig
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    ids = db.keys()
    set_ids = set(ids)
    print(len(ids))
    fetch = tabix.fetch(contig)
    rows = imap(lambda x: x.split('\t'), fetch)
    # keep coding-transcript rows, plus any row whose hgvs id appears in the
    # pre-built id set (to pick up non-coding transcripts of interest)
    annos = (row for row in rows
             if "CodingTranscript" in row[9]
             or get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = imap(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
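# get_hgvs_from_vcf is an external helper; a minimal sketch of the id format
# it is assumed to produce for SNVs (myvariant.info-style
# "chr<chrom>:g.<pos><ref>><alt>"), for illustration only:
def get_hgvs_from_vcf(chrom, pos, ref, alt):
    return "chr%s:g.%s%s>%s" % (chrom, pos, ref, alt)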
import csv
import operator
from itertools import groupby, ifilter, imap


def load_data(input_file):
    # os.system("sort -t$'\t' -k18 -k14 %s > %s_sorted.tsv" % (input_file, input_file))
    # open_file = open("%s_sorted.tsv" % (input_file))
    open_file = open(input_file)
    open_file = csv.reader(open_file, delimiter="\t")
    cosmic = []
    for row in open_file:
        # build a key from the genome coordinate (column 18) and the numeric
        # part of the HGVS c. notation (column 14)
        try:
            c = row[13].split(".")[1]
        except IndexError:
            c = ""
        row.append(row[17].split("-")[0] + "." + c)
        cosmic.append(row)
        if row[-1] != "":
            print row[-1]  # debug output
    # sort in memory so rows sharing an id are adjacent for groupby
    cosmic = sorted(cosmic, key=operator.itemgetter(17), reverse=True)
    cosmic = ifilter(lambda row: row[17] != "" and row[13] != "", cosmic)
    json_rows = imap(_map_line_to_json, cosmic)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cosmic") for rg in row_groups)
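# itertools.groupby only groups *adjacent* equal keys, which is why every
# loader above sorts its rows before grouping. A small self-contained
# illustration:
from itertools import groupby

docs = [{"_id": "a", "v": 1}, {"_id": "b", "v": 2}, {"_id": "a", "v": 3}]
unsorted_keys = [k for k, _ in groupby(docs, lambda d: d["_id"])]
sorted_keys = [k for k, _ in groupby(sorted(docs, key=lambda d: d["_id"]),
                                     lambda d: d["_id"])]
# unsorted_keys == ['a', 'b', 'a']  -- 'a' appears twice, so merging would miss one
# sorted_keys   == ['a', 'b']       -- duplicates are adjacent and mergeable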