def extract_citations_unicarbkb_ds(species):
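    # Write a citations CSV: for each (canonical accession, PMID) pair in the species UniCarbKB glycosylation-site sheet, print the PMID's title, journal, date, and authors, skipping blacklisted PMIDs and duplicates.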

    black_list = get_blacklisted_pmids(species)

    data_frame = {}
    in_file = path_obj[
        "unreviewed"] + "%s_proteoform_glycosylation_sites_unicarbkb.csv" % (
            species)
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]

    newrow = [
        "uniprotkb_canonical_ac", "pmid", "title", "journal_name",
        "publication_date", "authors"
    ]
    print "\"%s\"" % ("\",\"".join(newrow))
    seen = {}
    for row in data_frame["data"]:
        canon = row[f_list.index("uniprotkb_canonical_ac")]
        pmid = row[f_list.index("evidence")]
        if pmid in black_list:
            continue
        combo_id = "%s %s" % (canon, pmid)
        newrow = get_citation(pmid)
        if newrow != []:
            if combo_id not in seen:
                print "\"%s\"" % ("\",\"".join([canon] + newrow))
            seen[combo_id] = True

    return
Example #2
def make_taxid2name_ds():
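    # Print tax_id/tax_name rows for every taxon in the GlyTouCan taxa export, taking scientific names from the NCBI taxonomy names.dmp dump.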

    seen = {}
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/taxa.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        tax_id = row[f_list.index("TaxID")]
        seen[tax_id] = True

    newrow = ["tax_id", "tax_name"]
    print "\"%s\"" % ("\",\"".join(newrow))

    in_file = path_obj["downloads"] + "/ncbi/taxonomy/names.dmp"
    with open(in_file, "r") as FR:
        for line in FR:
            parts = line.strip().split("|")
            if parts[3].strip() == "scientific name":
                tax_id = parts[0].strip()
                tax_name = parts[1].strip()
                if tax_id in seen:
                    newrow = [tax_id, tax_name]
                    print "\"%s\"" % ("\",\"".join(newrow))
    return
Example #3
def extract_sequences_smiles_isomeric_ds():
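    # Map PubChem CIDs to GlyTouCan accessions via glycan_xref_pubchem.csv, then print isomeric SMILES sequences from the PubChem cid2smiles dump.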

    cid2glytoucan = {}
    data_frame = {}
    in_file = path_obj["unreviewed"] +  "glycan_xref_pubchem.csv" 
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        database_id = row[f_list.index("database_id")]
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = ac

    newrow = ["glytoucan_ac","pubchem_id", "sequence_smiles_isomeric"]
    print "\"%s\"" % ("\",\"".join(newrow))
    in_file = path_obj["downloads"] + "pubchem/compound/cid2smiles.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            cid, smiles = line.strip().split("\t")
            if cid in cid2glytoucan:
                newrow = [cid2glytoucan[cid], cid, str(smiles)]
                print "\"%s\"" % ("\",\"".join(newrow))



    return
Example #4
def extract_xref_chebi_from_kegg_ds():
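    # Build a KEGG GLYCAN -> ChEBI map from the ChEBI accession dump, then print ChEBI cross-references for glycans that already have KEGG cross-references.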


    kegg2chebi = {}
    data_frame = {}
    in_file = path_obj["downloads"] + "chebi/database_accession_current.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        if row[f_list.index("SOURCE")] == "KEGG GLYCAN":
            chebi_id = row[f_list.index("COMPOUND_ID")]
            kegg_id = row[f_list.index("ACCESSION_NUMBER")]
            kegg2chebi[kegg_id] = chebi_id


    newrow = ["glytoucan_ac","database_id","database_label"]
    print "\"%s\"" % ("\",\"".join(newrow))

    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_xref_kegg.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        kegg_id = row[f_list.index("database_id")]
        if kegg_id in kegg2chebi:
            chebi_id = kegg2chebi[kegg_id]
            newrow = [ac,chebi_id,"ChEBI"]
            print "\"%s\"" % ("\",\"".join(newrow))



    return
def main():
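    # Report any field used in an unreviewed dataset file that is not listed in generated/misc/field_names.csv.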

    in_file = "generated/misc/field_names.csv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, ",")
    seen = {}
    for row in data_frame["data"]:
        seen[row[0]] = True
    

    config_obj = json.loads(open("/data/projects/glygen/generated/misc/dataset-masterlist.json", "r").read())
   
    for species in ["human", "mouse", "rat"]:
        for cat in ["protein", "glycan", "proteoform"]:
            ds_list = config_obj[cat]["common"]
            ds_list += config_obj[cat][species] if species in config_obj[cat] else []

            for ds in ds_list:
                if ds in ["allsequences", "canonicalsequences"]:
                    continue
                ext = "csv"
                file_name = "%s_%s_%s.%s" % (species, cat, ds, ext)
                in_file = "unreviewed/%s" % (file_name)
                if os.path.isfile(in_file) == True:
                    data_frame = {}
                    libgly.load_sheet(data_frame, in_file, ",")
                    f_list = data_frame["fields"]
                    for f in f_list:
                        if f not in seen:
                            print "%s,%s" % (f, file_name)
Example #6
def extract_motif_ds():
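    # Print motif annotations (motif accession, label, reducing-end flag) for masterlist glycans, deduplicated by (glycan, motif) pair.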

    glycan_list = load_glycan_masterlist()
    newrow = ["glytoucan_ac","glytoucan_ac_motif","motif_name", "is_reducing_end"]
    print "\"%s\"" % ("\",\"".join(newrow))

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/allmotif.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    seen = {}
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        motif_ac = row[f_list.index("MotifAccession")]
        motif_label = row[f_list.index("Label")]
        is_reducing_end = row[f_list.index("IsReducingEnd")]
        combo_id = "%s %s" % (ac, motif_ac)
        if ac in glycan_list:
            if combo_id not in seen:
                seen[combo_id] = True
                newrow = [ac, motif_ac, motif_label, is_reducing_end]
                print "\"%s\"" % ("\",\"".join(newrow))


    return
Example #7
def extract_images_ds():
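    # Copy the CFG-extended image for each masterlist glycan into the intermediate area and package the images as glycan_images.tar.gz in the unreviewed directory.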

    images_dir = path_obj["downloads"] +  "glytoucan/current/export/images-cfg-extended/cfg/extended/"
        
    #Remove intermediate/glycanimages dir and create it again
    cmd = "rm -rf %s/glycanimages" % (path_obj["intermediate"])
    x = commands.getoutput(cmd)
    
    cmd = "mkdir %s/glycanimages" % (path_obj["intermediate"])
    x = commands.getoutput(cmd)


    seen_list = []
    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_masterlist.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac not in seen_list:
            seen_list.append(ac)
            cmd = "cp %s/%s.png %s/glycanimages/" % (images_dir, ac, path_obj["intermediate"])
            x = commands.getoutput(cmd)           

    cmd = "/usr/bin/tar -C %s/glycanimages/ -cvf %s/glycan_images.tar ./" % (path_obj["intermediate"], path_obj["unreviewed"])
    x = commands.getoutput(cmd)
    cmd = "/usr/bin/gzip %s/glycan_images.tar" % (path_obj["unreviewed"])
    x = commands.getoutput(cmd)

    return
def main():
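    # Rewrite the unreviewed-backup CSVs into temp/, renaming columns according to the field_names.csv mapping.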

    config_obj = json.loads(open("conf/config.json", "r").read())
    db_obj = config_obj[config_obj["server"]]["dbinfo"]
    data_dir = "unreviewed-backup/"

    field_dict = {}
    in_file = data_dir + "/field_names.csv"
    libgly.load_sheet_as_dict(field_dict, in_file, ",", "field_name_current")

    pattern = data_dir + "/*.csv"
    for in_file in glob.glob(pattern):
        file_name = in_file.split("/")[-1]
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        for j in xrange(0, len(data_frame["fields"])):
            field = data_frame["fields"][j]
            if field in field_dict["data"]:
                if field_dict["data"][field][0][0] != "":
                    data_frame["fields"][j] = field_dict["data"][field][0][0]

        out_file = "temp/%s" % (file_name)
        FW = open(out_file, "w")
        FW.write("\"%s\"\n" % ("\",\"".join(data_frame["fields"])))
        for row in data_frame["data"]:
            FW.write("\"%s\"\n" % ("\",\"".join(row)))
        FW.close()
Example #9
def extract_sequences_inchi_ds():
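    # Print InChI and InChIKey strings for masterlist glycans, joined to PubChem CIDs through glycan_xref_pubchem.csv.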

    glycan_list = load_glycan_masterlist()

    cid2glytoucan = {}
    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_xref_pubchem.csv" 
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        database_id = row[f_list.index("database_id")]
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = ac

    newrow = ["glytoucan_ac","sequence_inchi","inchi_key"]
    print "\"%s\"" % ("\",\"".join(newrow))

    data_frame = {}
    in_file = "generated/pubchem/compound/cid2inchi.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        cid = row[f_list.index("pubchem_cid")]
        inchi = row[f_list.index("inchi")]
        inchikey = row[f_list.index("inchikey")]
        if cid in cid2glytoucan:
            glytoucan_ac = cid2glytoucan[cid]
            if glytoucan_ac in glycan_list:
                newrow = [glytoucan_ac, inchi, inchikey]
                print "\"%s\"" % ("\",\"".join(newrow))


    return
def main():
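    # Download Genomics England PanelApp gene records (JSON) for every gene name in the human HGNC cross-reference sheet, skipping genes already downloaded or with empty results.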

    config_obj = json.loads(open("conf/config.json", "r").read())
    species_obj = config_obj["speciesinfo"]

    global path_obj
    path_obj = config_obj["pathinfo"]

    url_tmplt = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?entity_name=%s"

    data_frame = {}
    in_file = "unreviewed/human_protein_xref_hgnc.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        canon = row[f_list.index("uniprotkb_canonical_ac")]
        gene_name = row[f_list.index("database_label")]
        out_file = path_obj[
            "downloads"] + "genomics_england/panels/%s.json" % (gene_name)
        if os.path.isfile(out_file) == True:
            continue
        url = url_tmplt % (gene_name)
        res = requests.get(url, verify=False)
        if res.content.strip() != "":
            res_obj = json.loads(res.content)
            if res_obj["results"] == []:
                continue
            with open(out_file, 'w') as FW:
                FW.write("%s\n" % (res.content))
                print "downloaded json for %s " % (gene_name)
Example #11
def extract_citations_glytoucan_ds():
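    # Print citations for masterlist glycans from the GlyTouCan pubs export, pulling title/journal/date/authors from locally cached MEDLINE records and skipping blacklisted, duplicate, or composition-derived entries.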

    black_list = get_blacklisted_pmids()

    glycan_list = load_glycan_masterlist()

    in_file = path_obj["downloads"] + "glytoucan/current/export/pubs.tsv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]

    newrow = ["glytoucan_ac","pmid","title","journal_name","publication_date", "authors","source", "source_id"]
    print "\"%s\"" % ("\",\"".join(newrow))
    seen = {}
    for row in data_frame["data"]:
        glytoucan_ac = row[f_list.index("GlyTouCanAccession")]
        pmid = row[f_list.index("PubMedID")]
        source = row[f_list.index("Source")]
        source_id = row[f_list.index("SourceID")]
        if source_id.find("comp_") != -1:
            continue
        if pmid in black_list:
            continue
        combo_id = "%s %s" % (glytoucan_ac, pmid) 
        cond_list = [glytoucan_ac not in glycan_list and glytoucan_ac != ""]
        cond_list.append(pmid in ["0"])
        cond_list.append(combo_id in seen)
        if True in cond_list:
            continue
        out_file = path_obj["downloads"] + "ncbi/pubmed/medline/pmid.%s.txt" % (pmid)    
        if os.path.isfile(out_file) == True:
            obj = {}
            with open(out_file, "r") as FR:
                lcount = 0
                prev_key = ""
                for line in FR:
                    lcount += 1
                    if lcount > 3:
                        key = line[0:4].strip()
                        val = line[5:].strip()
                        if key not in obj:
                            obj[key] = []
                        if key == "":
                            obj[prev_key].append(val)
                        else:
                            obj[key].append(val)
                            prev_key = key
            if "TI" not in obj or "JT" not in obj:
                continue
            title = " ".join(obj["TI"])
            journal = " ".join(obj["JT"])
            pubdate = " ".join(obj["DP"])
            authors = ", ".join(obj["AU"])
            newrow = [glytoucan_ac, pmid, title, journal, pubdate, authors, source, source_id]
            print "\"%s\"" % ("\",\"".join(newrow))
            seen[combo_id] = True


    return
Example #12
def extract_taxonomy_ds():
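    # Print taxonomy assignments for masterlist glycans from the GlyTouCan taxa export, plus rows parsed from GlyTouCan/UniCarbKB species evidence, deduplicated.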


    glycan_list = load_glycan_masterlist()

    newrow = ["glytoucan_ac","tax_id", "source", "source_id"]
    print "\"%s\""  % ("\",\"".join(newrow))

    seen = {}
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/taxa.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        tax_id = row[f_list.index("TaxID")]
        source = row[f_list.index("Source")]
        source_id = row[f_list.index("SourceID")]
        if source_id.find("comp_") != -1:
            continue
        if ac in glycan_list:
            newrow = [ac, tax_id, source, source_id]
            newrow_str = ",".join(newrow)
            if newrow_str not in seen:
                print "\"%s\""  % ("\",\"".join(newrow))
                seen[newrow_str] = True

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/species.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        evidence_type = row[f_list.index("Species")]
        evidence_desc = row[f_list.index("Value")].replace("[", "").replace("]", "").replace("'", "")
        if evidence_desc.find("TaxID") == -1:
            continue
        newrow = []
        if evidence_desc.find("GlyTouCan") != -1:
            tax_id = evidence_desc.split(" ")[-1]
            source = "GlyTouCan"
            source_id = ac
            newrow = [ac,tax_id, source, source_id]
        elif evidence_desc.find("UniCarbKB") != -1:
            tax_id = evidence_desc.split(" ")[-1]
            source = "UniCarbKB"
            source_id = evidence_desc.split(" ")[-3].split(":")[1]
            if source_id.find("comp_") == -1:
                newrow = [ac,tax_id, source, source_id]
        if ac in glycan_list and newrow != []:
            newrow_str = ",".join(newrow)
            if newrow_str not in seen:
                print "\"%s\""  % ("\",\"".join(newrow))
                seen[newrow_str] = True


    return
Example #13
def main():
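    # Compare dataset headers between the current unreviewed files and a previous release's reviewed files, reporting missing files and field differences.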

    usage = "\n%prog  [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-s",
                      "--species",
                      action="store",
                      dest="species",
                      help="human/mouse/rat")
    parser.add_option("-r",
                      "--release",
                      action="store",
                      dest="release",
                      help="1.0.13")

    (options, args) = parser.parse_args()
    for opt in [options.species, options.release]:
        if not opt:
            parser.print_help()
            sys.exit(0)

    species = options.species
    release = options.release

    config_obj = json.loads(
        open("/data/projects/glygen/generated/misc/dataset-masterlist.json",
             "r").read())

    for cat in ["protein", "glycan", "proteoform"]:
        ds_list = config_obj[cat]["common"]
        ds_list += config_obj[cat][species] if species in config_obj[
            cat] else []

        for ds in ds_list:
            if ds in ["allsequences", "canonicalsequences"]:
                continue
            ext = "csv"
            file_name = "%s_%s_%s.%s" % (species, cat, ds, ext)
            in_file_one = "unreviewed/%s" % (file_name)
            in_file_two = "/data/projects/glygen/releases/data/v-%s/reviewed/%s" % (
                release, file_name)
            if os.path.isfile(in_file_one) == False or os.path.isfile(
                    in_file_two) == False:
                print "%s was not found in both versions" % (file_name)
                continue
            data_frame_one, data_frame_two = {}, {}
            libgly.load_sheet(data_frame_one, in_file_one, ",")
            libgly.load_sheet(data_frame_two, in_file_two, ",")
            f_list_one = data_frame_one["fields"]
            f_list_two = data_frame_two["fields"]
            if f_list_one != f_list_two:
                set_one = set(f_list_one)
                set_two = set(f_list_two)
                diff_one = "|".join(list(set_one - set_two))
                diff_two = "|".join(list(set_two - set_one))
                print "%s,%s,%s" % (file_name, diff_one, diff_two)
def get_blacklisted_pmids(species):
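    # Load the species-specific list of blacklisted PMIDs from the compiled directory, if the file exists.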

    black_list = []
    in_file = "compiled/%s_protein_blacklisted_pmids.csv" % (species)
    if os.path.isfile(in_file) == True:
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        for row in data_frame["data"]:
            black_list.append(row[0])
        black_list = sorted(set(black_list))

    return black_list
Example #15
def load_canon2xref(in_file, map_dict_one, map_dict_two):
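    # Populate two maps from a cross-reference sheet: database_id -> canonical accession, and canonical accession -> {id, name}.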

    sheet_obj = {}
    libgly.load_sheet(sheet_obj, in_file, ",")
    f_list = sheet_obj["fields"]
    for row in sheet_obj["data"]:
        map_dict_one[row[f_list.index("database_id")]] = row[f_list.index("uniprotkb_canonical_ac")]
        map_dict_two[row[f_list.index("uniprotkb_canonical_ac")]] = {
            "id":row[f_list.index("database_id")],
            "name":row[f_list.index("database_label")]
        }
    return
Example #16
def main():
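    # Count glycosylation motif sites and mutation sites per canonical accession and print the counts along with the motif types seen.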

    site_dict = {"glcoyslation": {}, "mutation": {}}
    site_type_dict = {"glcoyslation": {}, "mutation": {}}
    for in_file in glob.glob("unreviewed/*_protein_glycosylation_motifs.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            canon = row[f_list.index("uniprotkb_canonical_ac")]
            start_pos = row[f_list.index("start_pos")]
            motif = row[f_list.index("motif")]
            motif_type = motif[0]
            if canon not in site_dict["glycosylation"]:
                site_dict["glycosylation"][canon] = []
                site_type_dict["glycosylation"][canon] = []
            if start_pos not in site_dict["glycosylation"][canon]:
                site_dict["glycosylation"][canon].append(start_pos)
                site_type_dict["glycosylation"][canon].append(motif_type)

    for in_file in glob.glob("unreviewed/*_protein_mutation.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            canon = row[f_list.index("uniprotkb_canonical_ac")]
            start_pos = row[f_list.index("aa_pos")]
            if canon not in site_dict["mutation"]:
                site_dict["mutation"][canon] = []
            if start_pos not in site_dict["mutation"][canon]:
                site_dict["mutation"][canon].append(start_pos)

    newrow = [
        "uniprotkb_canonical_ac", "glycosylation_site_count",
        "mutation_site_count", "motif_types"
    ]
    print "\"%s\"" % ("\",\"".join(newrow))

    canon_list = sorted(
        list(
            set(site_dict["glcoyslation"].keys() +
                site_dict["mutation"].keys())))
    for canon in canon_list:
        newrow = [canon]
        g1, g2, m1 = "", "", ""
        if canon in site_dict["glycosylation"]:
            g1 = str(len(site_dict["glycosylation"][canon]))
            g2 = ";".join(
                sorted(list(set(site_type_dict["glycosylation"][canon]))))
        if canon in site_dict["mutation"]:
            m1 = str(len(site_dict["mutation"][canon]))
        newrow = [canon, g1, m1, g2]
        print "\"%s\"" % ("\",\"".join(newrow))
Example #17
def load_glycan_masterlist():
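    # Return the list of GlyTouCan accessions in the glycan masterlist.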

    glycan_list = []
    data_frame = {}
    in_file = path_obj["unreviewed"] +  "glycan_masterlist.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac not in glycan_list:
            glycan_list.append(ac)

    return glycan_list
Example #18
def main():
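    # Filter the downloaded PubChem SDF archives, keeping only compound records whose CID maps to a GlyTouCan accession, and delete output files with no matches.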

    config_obj = json.loads(open("conf/config.json", "r").read())
    species_obj = config_obj["speciesinfo"]

    cid2glytoucan = {}
    data_frame = {}
    for in_file in glob.glob("unreviewed/*_glycan_xref_pubchem.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            ac = row[f_list.index("glytoucan_ac")]
            database_id = row[f_list.index("database_id")]
            if database_id[0:3] == "CID":
                cid = database_id[3:]
                cid2glytoucan[cid] = ac

    seen = {}
    start = 1
    end = 25000
    n_failed = 0
    for i in xrange(0, 10000):
        start = i * 25000 + 1
        end = start + 25000 - 1
        start = "000000000"[0:-len(str(start))] + str(start)
        end = "000000000"[0:-len(str(end))] + str(end)
        in_file = "downloads/pubchem/compound/sdf/Compound_%s_%s.sdf.gz" % (
            start, end)
        out_file = "downloads/pubchem/compound/sdf4glygen/Compound_%s_%s.sdf" % (
            start, end)
        if os.path.isfile(in_file) == True:
            FW = open(out_file, "w")
            flag = False
            with gzip.open(in_file, 'rb') as FR:
                prev_line, cid, buf = "", "", ""
                for line in FR:
                    buf += line
                    line = line.strip()
                    if line == "$$$$":
                        if cid != "" and cid in cid2glytoucan:
                            flag = True
                            FW.write("%s" % (buf))
                        cid, buf = "", ""
                    if prev_line == "> <PUBCHEM_COMPOUND_CID>":
                        cid = line
                    prev_line = line
            FW.close()
            if flag == False:
                cmd = "rm -f %s" % (out_file)
                x = commands.getoutput(cmd)
Example #19
def extract_xref_chebi_ds():
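    # Print ChEBI cross-references for glycans by scanning the PubChem cid2synonym dump for CHEBI identifiers, requiring an InChI entry on both the ChEBI and PubChem sides.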

    chebi2inchi = {}
    in_file = path_obj["downloads"] + "chebi/chebiId_inchi.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            parts = line.strip().split("\t")
            chebi2inchi[parts[0]] = parts[1]
    
    pubchem2inchi = {}
    in_file = "generated/pubchem/compound/cid2inchi.csv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        pubchem_id = row[f_list.index("pubchem_cid")]
        inchi = row[f_list.index("inchi")]
        pubchem2inchi[pubchem_id] = inchi
    

    cid2glytoucan = {}
    data_frame = {}
    in_file = path_obj["unreviewed"] +  "glycan_xref_pubchem.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        database_id = row[f_list.index("database_id")]
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = ac

    newrow = ["glytoucan_ac","database_id","database_label"]
    print "\"%s\""  % ("\",\"".join(newrow))
    in_file = path_obj["downloads"] + "pubchem/compound/cid2synonym.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            line = line.strip()
            cid, chebi_id = line.split("\t")
            if cid in cid2glytoucan and chebi_id[0:6] == "CHEBI:":
                if chebi_id[6:] in chebi2inchi and cid in pubchem2inchi:
                    chebi_inchi = chebi2inchi[chebi_id[6:]]
                    pubchem_inchi = pubchem2inchi[cid]
                    #newrow = [cid2glytoucan[cid], "CID" + cid, chebi_id]
                    newrow = [cid2glytoucan[cid], chebi_id.split(":")[1], "ChEBI"]
                    print "\"%s\"" % ("\",\"".join(newrow))


    return
Example #20
def extract_fully_determined_ds():
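    # Print the masterlist glycans that appear in the GlyTouCan fully_determined export.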

    glycan_list = load_glycan_masterlist()
    newrow = ["glytoucan_ac"]
    print "\"%s\""  % ("\",\"".join(newrow))

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/fully_determined.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        if ac in glycan_list:
            newrow = [ac]
            print "\"%s\""  % ("\",\"".join(newrow))

    return
Example #21
def extract_synthesized_ds():
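    # Print rows from the compiled glycan_synthesized sheet for glycans on the masterlist.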

    glycan_list = load_glycan_masterlist()

    in_file = "compiled/glycan_synthesized.csv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    newrow = f_list
    print "\"%s\""  % ("\",\"".join(newrow))
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac in glycan_list:
            newrow = row
            print "\"%s\""  % ("\",\"".join(newrow))

    return
Example #22
def extract_monosaccharide_composition_ds():
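    # Print monosaccharide composition rows from the GlyTouCan monocomp export for masterlist glycans.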

    glycan_list = load_glycan_masterlist()

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/monocomp.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    newrow = f_list
    newrow[0] = "glytoucan_ac"
    print "\"%s\"" % ("\",\"".join(newrow))

    seen = {}
    for row in data_frame["data"]:
        newrow = row
        if newrow[0] in glycan_list:
            print "\"%s\"" % ("\",\"".join(newrow))

    return
Example #23
def main():
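    # Sanity-check the UniCarbKB human download: report the first row whose column count does not match the header.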

    in_file = "downloads/unicarbkb/human29112019.csv"
    if os.path.isfile(in_file) == True:
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        n_fields = len(f_list)
        flag = True
        row_count = 0
        for row in data_frame["data"]:
            row_count += 1
            n_cols = len(row)
            if n_fields != n_cols:
                flag = False
                print "Bad row, row number=%s" % (row_count)
                print in_file
                print f_list
                print row
                break
def load_glycosylation_type_two():
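    # Map GlyTouCan accessions to n-linked/o-linked glycosylation types using the GlyTouCan classification export.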

    data_frame = {}
    in_file = path_obj[
        "downloads"] + "glytoucan/current/export/classification.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    glytoucanac2glycosylationtype = {}
    for row in data_frame["data"]:
        glytoucan_ac = row[f_list.index("GlyTouCanAccession")].strip()
        gly_type = row[f_list.index("Type")].strip().lower()
        gly_subtype = row[f_list.index("Subtype")].strip()
        if glytoucan_ac not in glytoucanac2glycosylationtype:
            glytoucanac2glycosylationtype[glytoucan_ac] = []
        if gly_type not in glytoucanac2glycosylationtype[glytoucan_ac]:
            if gly_type == "n-linked":
                glytoucanac2glycosylationtype[glytoucan_ac].append(gly_type)
            if gly_type == "o-linked":
                glytoucanac2glycosylationtype[glytoucan_ac].append(gly_type)

    return glytoucanac2glycosylationtype
Example #25
def main():
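    # Check every CSV dataset listed in the masterlist for rows whose column count differs from the header, printing running tested/passed/failed counts.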

    in_file = "/data/projects/glygen/generated/misc/dataset-masterlist.json"
    ds_obj_list = json.loads(open(in_file, "r").read())
    ntested, npassed, nfailed = 0, 0, 0
    for obj in ds_obj_list:
        ds_name = obj["name"]
        ds_format = obj["format"]
        mol = obj["categories"]["molecule"]
        if ds_format != "csv":
            continue
        file_list = []
        for species in obj["categories"]["species"]:
            f = "unreviewed/%s_%s_%s.%s" % (species, mol, ds_name, ds_format)
            file_list.append(f)
        if file_list == []:
            f = "unreviewed/%s_%s.%s" % (mol, ds_name, ds_format)
            file_list.append(f)

        for in_file in file_list:
            if os.path.isfile(in_file) == True:
                data_frame = {}
                libgly.load_sheet(data_frame, in_file, ",")
                f_list = data_frame["fields"]
                n_fields = len(f_list)
                flag = True
                for row in data_frame["data"]:
                    n_cols = len(row)
                    if n_fields != n_cols:
                        flag = False
                        print "Bad row"
                        print in_file
                        print f_list
                        print row
                        break
                ntested += 1
                npassed += 1 if flag == True else 0
                nfailed += 1 if flag == False else 0
                print "%s tested, %s passed, %s failed" % (ntested, npassed,
                                                           nfailed)
def load_glycosylation_type_one(species):
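    # Map motif accessions from the species UniCarbKB motif file to n-linked/o-linked glycosylation types based on their motif labels.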

    data_frame = {}
    in_file = path_obj["downloads"] + "unicarbkb/%s_motif_current.txt" % (
        species)
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    glytoucanac2glycosylationtype = {}
    for row in data_frame["data"]:
        uckb_id = row[f_list.index("uckb_id")].strip()
        glytoucan_ac = row[f_list.index("motif_ac")].strip()
        motif_label = row[f_list.index("motif_label")].strip()
        if glytoucan_ac not in glytoucanac2glycosylationtype:
            glytoucanac2glycosylationtype[glytoucan_ac] = []
        if motif_label.lower().find("n-glycan") != -1:
            if "n-linked" not in glytoucanac2glycosylationtype[glytoucan_ac]:
                glytoucanac2glycosylationtype[glytoucan_ac].append("n-linked")
        if motif_label.lower().find("o-glycan") != -1:
            if "o-linked" not in glytoucanac2glycosylationtype[glytoucan_ac]:
                glytoucanac2glycosylationtype[glytoucan_ac].append("o-linked")

    return glytoucanac2glycosylationtype
Example #27
def extract_classification_ds():
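    # Print glycan type and subtype for masterlist glycans, renaming N-linked/O-linked to N-glycan/O-glycan.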

    glycan_list = load_glycan_masterlist()

    newrow = ["glytoucan_ac","glycan_type","glycan_subtype"]
    print "\"%s\"" % ("\",\"".join(newrow))

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/classification.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        glytoucan_ac = row[f_list.index("GlyTouCanAccession")]
        if glytoucan_ac in glycan_list:
            glycan_type = row[f_list.index("Type")]
            glycan_type = "N-glycan" if glycan_type == "N-linked" else glycan_type
            glycan_type = "O-glycan" if glycan_type == "O-linked" else glycan_type
            glycan_subtype = row[f_list.index("Subtype")]
            newrow = [glytoucan_ac, glycan_type, glycan_subtype]
            print "\"%s\"" % ("\",\"".join(newrow))


    return
Example #28
def extract_masterlist_ds():
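    # Print the glycan masterlist: type, mass, and composition/topology properties for every accession in the GlyTouCan allglycan export.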

    seen_list = []
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/allglycan.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        if ac not in seen_list:
            seen_list.append(ac)

    newrow = [
        "glytoucan_ac","glytoucan_type","glycan_mass", "glycan_permass",
        "base_composition","composition","topology","monosaccharides"
    ]
    print "\"%s\"" % ("\",\"".join(newrow))

    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/glycan_properties.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        newrow = [
            row[f_list.index("glytoucan_acc")],
            row[f_list.index("glytoucan_type")],
            row[f_list.index("glycan_mass")],
            row[f_list.index("glycan_permass")],
            row[f_list.index("base_composition")],
            row[f_list.index("composition")],
            row[f_list.index("topology")],
            row[f_list.index("monosaccharides")]
        ]
        if newrow[0] in seen_list:
            print "\"%s\"" % ("\",\"".join(newrow))

    return
Example #29
def extract_sequences_ds(seq_format):
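    # Print glycan sequences in the requested format for masterlist accessions, reading the per-accession text file from the GlyTouCan export directory for that format.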

    newrow = ["glytoucan_ac","sequence_%s" % (seq_format)]
    print "\"%s\"" % ("\",\"".join(newrow))

    seen_list = []
    data_frame = {}
    in_file = path_obj["unreviewed"] +  "glycan_masterlist.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac not in seen_list:
            seen_list.append(ac)
            in_file = path_obj["downloads"] + "glytoucan/current/export/%s/%s.txt" % (seq_format, ac)
            if os.path.isfile(in_file) == True:
                with open(in_file, "r") as FR:
                    seq = ""
                    for line in FR:
                        seq += " " + line.strip()
                    newrow = [ac, seq.strip()]
                    print "\"%s\"" % ("\",\"".join(newrow))

    return
def main():
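    # Report any field in the temp/*.csv files that is not defined in unreviewed/field_names.csv.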

    config_obj = json.loads(open("conf/config.json", "r").read())
    db_obj = config_obj[config_obj["server"]]["dbinfo"]

    field_dict = {}
    in_file = "unreviewed/field_names.csv"
    libgly.load_sheet(field_dict, in_file, ",")

    field_list = []
    for row in field_dict["data"]:
        field = row[1] if row[1] != "" else row[0]
        field_list.append(field)

    pattern = "temp/*.csv"
    for in_file in glob.glob(pattern):
        file_name = in_file.split("/")[-1]
        if file_name == "field_names.csv":
            continue
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        for field in data_frame["fields"]:
            if field not in field_list:
                print "undefined", field, file_name