示例#1
0
def fetch_generator(tabix, contig):
    """Yield merged CADD JSON documents for every variant on *contig*.

    Pipeline: tab-split each tabix row -> keep coding-transcript rows
    -> convert to JSON docs -> drop empties -> group consecutive docs
    sharing an ``_id`` -> merge each group.
    """
    raw_lines = tabix.fetch(contig)
    fields = map(lambda line: line.split('\t'), raw_lines)
    # column 9 is the annotation-type field; keep coding transcripts only
    coding = (cols for cols in fields if "CodingTranscript" in cols[9])
    documents = map(_map_line_to_json, coding)
    non_empty = (doc for doc in documents if doc)
    grouped = (dups for _, dups in groupby(non_empty, lambda doc: doc["_id"]))
    return (merge_duplicate_rows(dups, "cadd") for dups in grouped)
示例#2
0
def fetch_generator(tabix, contig):
    """Return a generator of merged CADD documents for one contig.

    Rows are tab-split, restricted to coding-transcript annotations,
    mapped to JSON docs, grouped on ``_id`` and merged.
    """
    tab_split = lambda raw: raw.split('\t')
    parsed = map(tab_split, tabix.fetch(contig))
    # column 9 (annotation type) must mention a coding transcript
    coding_only = (cols for cols in parsed if "CodingTranscript" in cols[9])
    documents = map(_map_line_to_json, coding_only)
    kept = (doc for doc in documents if doc)
    by_id = groupby(kept, lambda doc: doc["_id"])
    merged = (merge_duplicate_rows(dups, "cadd") for _, dups in by_id)
    return merged
示例#3
0
def load_data(input_file):
    """Stream merged GRASP documents out of ``<input_file>.tsv``.

    Skips the header, drops records whose column 58 is empty, converts
    the rest to JSON docs and merges consecutive docs sharing an
    ``_id``.
    """
    handle = open("%s.tsv" % input_file)
    reader = csv.reader(handle, delimiter="\t")
    next(reader)  # discard the header line
    records = imap(row_generator, reader)
    records = ifilter(lambda rec: rec[58] != "", records)
    documents = imap(_map_line_to_json, records)
    kept = (doc for doc in documents if doc)
    grouped = groupby(kept, lambda doc: doc["_id"])
    return (merge_duplicate_rows(dups, "grasp") for _, dups in grouped)
示例#4
0
def load_data(input_file):
    """Yield merged GRASP JSON documents from ``<input_file>.tsv``.

    :param input_file: path prefix of the tab-separated source file
        (".tsv" is appended).
    :return: generator of JSON documents; consecutive documents sharing
        an ``_id`` are merged via ``merge_duplicate_rows``.
    """
    open_file = open('%s.tsv' % input_file)
    reader = csv.reader(open_file, delimiter="\t")
    next(reader)  # skip the header row
    # FIX: the original used Python 2's eager ``map`` inside an
    # otherwise lazy ``ifilter`` pipeline, materialising the whole TSV
    # in memory; generator expressions keep the pipeline streaming and
    # consistent with the sibling GRASP loader.
    grasp = (row_generator(row) for row in reader)
    grasp = ifilter(lambda row: row[58] != "", grasp)
    json_rows = (_map_line_to_json(row) for row in grasp)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "grasp") for rg in row_groups)
def load_data(input_file):
    """Yield merged DrugBank documents from a comma-separated file.

    Typical input:
    '/opt/myvariant.info/load_archive/drugbank/drugbank.csv'
    """
    reader = csv.reader(open(input_file), delimiter=',')
    next(reader)  # header line
    documents = imap(_map_line_to_json, reader)
    id_groups = groupby(documents, lambda doc: doc['_id'])
    return (merge_duplicate_rows(dups, 'drugbank') for _, dups in id_groups)
示例#6
0
def data_generator(input_file):
    """Yield merged EMV documents from *input_file*.

    The file is first shell-sorted on its first column (the hgvs id
    returned from Mutalyzer) into ``<input_file>.sorted`` so that
    ``groupby`` sees duplicate ids consecutively.
    """
    # NOTE(review): input_file is interpolated unquoted into a shell
    # command -- assumed to be a trusted, space-free path; confirm.
    os.system("sort -k1 -n %s > %s.sorted" % (input_file, input_file))
    reader = csv.reader(open("%s.sorted" % input_file), delimiter=",")
    next(reader)  # drop the header line
    non_blank = filter(lambda rec: rec[0], reader)
    documents = map(_map_line_to_json, non_blank)
    id_groups = groupby(documents, lambda doc: doc["_id"])
    return (merge_duplicate_rows(dups, "emv") for _, dups in id_groups)
示例#7
0
def data_generator(input_file):
    """Yield merged EVS documents from a space-delimited input file."""
    reader = csv.reader(open(input_file), delimiter=" ")
    # the first 8 lines are metadata, not records
    body = islice(reader, 8, None)
    valid = (rec for rec in body
             if ":" in rec[30] and len(rec) == VALID_COLUMN_NO)
    # records listing several mutations (";"-separated col 3) are dropped
    single = (rec for rec in valid if len(rec[3].split(";")) == 1)
    documents = map(_map_line_to_json, single)
    id_groups = groupby(documents, lambda doc: doc["_id"])
    return (merge_duplicate_rows(dups, "evs") for _, dups in id_groups)
示例#8
0
def data_generator(input_file, version):
    """Yield merged EVS documents from a space-delimited input file.

    :param input_file: path to the raw EVS dump.
    :param version: dataset version forwarded to ``_map_line_to_json``.
    :return: generator of JSON documents; consecutive documents with
        the same ``_id`` are merged.
    """
    open_file = open(input_file)
    evs = csv.reader(open_file, delimiter=" ")
    # Skip first 8 meta lines
    evs = islice(evs, 8, None)
    evs = (row for row in evs
           if ":" in row[30] and len(row) == VALID_COLUMN_NO)
    # skip rows with multiple mutations
    evs = (row for row in evs if len(row[3].split(";")) == 1)
    # FIX: ``map(_map_line_to_json, version=version, evs)`` was a
    # SyntaxError (positional argument after keyword argument); a
    # generator expression forwards ``version`` for each row instead.
    json_rows = (_map_line_to_json(row, version=version) for row in evs)
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "evs") for rg in row_groups)
def load_data(input_file):
    """Yield merged ClinVar documents from a tab-separated dump.

    The input is shell-sorted into ``<input_file>_sorted.tsv`` first so
    that duplicate ids are adjacent for ``groupby``.  Rows must have a
    concrete column 18 (no '-' or '?'), a non-empty column 13 and
    column 12 equal to "GRCh37".
    """
    os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \
              % (input_file, input_file))
    open_file = open("%s_sorted.tsv" % (input_file))
    print(input_file)
    clinvar = csv.reader(open_file, delimiter="\t")
    # FIX: the original pattern r'p.' left the dot unescaped, so it
    # matched "p" followed by ANY character; the intent is to exclude
    # rows given in protein notation (literal "p." -- presumed HGVS).
    clinvar = (row for row in clinvar
               if row[18] != '-' and row[18].find('?') == -1 and row[13] != ""
               and row[12] == "GRCh37" and not re.search(r'p\.', row[18]))
    json_rows = (row for row in imap(_map_line_to_json, clinvar) if row)
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)
示例#10
0
def fetch_generator(tabix, contig):
    """Yield merged CADD documents for *contig*.

    A row is kept when its annotation-type column (9) mentions a coding
    transcript, or when the hgvs id derived from its VCF fields appears
    in the per-contig dbm id database.
    """
    # NOTE(review): path has no leading '/' -- looks like it should be
    # '/home/kevinxin/...'; confirm before relying on it.
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    try:
        ids = db.keys()
    finally:
        # FIX: the dbm handle was never closed; only the key set is
        # needed past this point, so release the database here.
        db.close()
    set_ids = set(ids)
    print(len(ids))
    fetch = tabix.fetch(contig)
    rows = imap(lambda x: x.split('\t'), fetch)
    # looking for annotype as 'codingtranscript', 'noncodingtranscript'
    annos = (row for row in rows if "CodingTranscript" in row[9] or
             get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = imap(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
示例#11
0
def load_data(input_file):
    """Yield merged ClinVar documents from a tab-separated dump.

    Shell-sorts the input into ``<input_file>_sorted.tsv`` so groupby
    sees duplicate ids consecutively, then filters, converts and merges
    the rows.
    """
    os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \
              % (input_file, input_file))
    open_file = open("%s_sorted.tsv" % (input_file))
    print(input_file)
    clinvar = csv.reader(open_file, delimiter="\t")
    clinvar = (row for row in clinvar
               if row[18] != '-' and
               row[18].find('?') == -1 and
               row[13] != "" and
               row[12] == "GRCh37" and
               # FIX: r'p.' matched "p" + any character; escape the dot
               # so only literal protein-notation ("p.") rows are
               # excluded (presumed HGVS -- confirm with the data).
               not re.search(r'p\.', row[18]))
    json_rows = (row for row in imap(_map_line_to_json, clinvar) if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row:
                  row["_id"]))
    return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)
示例#12
0
def fetch_generator(tabix, contig):
    """Yield merged CADD documents for *contig*.

    A row is kept when column 9 (annotation type) mentions a coding
    transcript, or when the hgvs id built from its VCF fields appears
    in the per-contig dbm id database.
    """
    # NOTE(review): path lacks a leading '/' -- probably meant to be
    # '/home/kevinxin/...'; verify before deploying.
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    try:
        ids = db.keys()
    finally:
        # FIX: the dbm handle leaked in the original; everything after
        # this point only needs the in-memory key set.
        db.close()
    set_ids = set(ids)
    print(len(ids))
    fetch = tabix.fetch(contig)
    rows = imap(lambda x: x.split('\t'), fetch)
    #   looking for annotype as 'codingtranscript', 'noncodingtranscript'
    annos = (row for row in rows if "CodingTranscript" in row[9]
             or get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = imap(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
示例#13
0
def load_data(input_file):
    """Yield merged COSMIC documents from a tab-separated dump.

    Appends a synthetic last column "<col17-prefix>.<col13-suffix>" to
    every row, sorts the rows descending on column 17, keeps rows with
    non-empty columns 13 and 17, converts them to JSON docs and merges
    consecutive docs sharing an ``_id``.
    """
    open_file = open(input_file)
    reader = csv.reader(open_file, delimiter="\t")
    cosmic = []
    for row in reader:
        try:
            c = row[13].split(".")[1]
        # FIX: the bare ``except:`` also swallowed KeyboardInterrupt /
        # SystemExit; only the expected missing-field error is handled.
        except IndexError:
            c = ""
        row.append(row[17].split("-")[0] + "." + c)
        cosmic.append(row)
        if row[-1] != "":
            # NOTE(review): debug output -- the appended value always
            # contains ".", so this condition is always true.
            print(row[-1])
    cosmic = sorted(cosmic, key=operator.itemgetter(17), reverse=True)
    cosmic = ifilter(lambda row:
                     row[17] != "" and
                     row[13] != "", cosmic)
    json_rows = imap(_map_line_to_json, cosmic)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cosmic") for rg in row_groups)