示例#1
0
def main():
    """Set a single property to a single value on a batch of Portal records.

    Record identifiers are taken from ``args.records``; when none were given they are read
    from ``args.infile`` (one identifier per line; blank lines and lines starting with "#"
    are skipped). Each record is PATCHed on its own so that one failure doesn't stop the
    batch. Failures are reported on stdout and a summary count is printed at the end.
    """
    parser = get_parser()
    args = parser.parse_args()
    # Work on a fresh list so a None default (or a shared argparse default list) is never
    # appended to.
    rec_ids = list(args.records or [])
    infile = args.infile
    prop_name = args.property
    prop_val = args.value
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if not rec_ids:
        # Then get them from input file
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                rec_ids.append(line)
    patch_cnt = 0
    for rec_id in rec_ids:
        try:
            conn.patch({prop_name: prop_val, conn.ENCID_KEY: rec_id})
        except Exception as err:
            # Not every exception carries an HTTP response (e.g. connection errors), so
            # look it up defensively instead of assuming ``err.response`` exists.
            response = getattr(err, "response", None)
            if response is not None and response.status_code == 422:  # Unprocessable Entity
                # Then likely this property is not defined for this record.
                text = json.loads(response.text)
                print("Can't PATCH record {}: {}".format(rec_id, text["errors"]))
            else:
                # Report other failures too rather than dropping them silently.
                print("Can't PATCH record {}: {}".format(rec_id, err))
        else:
            patch_cnt += 1
    print("Finished: PATCHED {}/{} records.".format(patch_cnt, len(rec_ids)))
def main():
    """Collect the unique genetic modification IDs used by a set of records.

    Record identifiers are taken from ``args.records``; when none were given they are read
    from ``args.infile`` (one per line; blank lines and "#" comment lines are skipped).
    For every replicate of every record, the genetic modifications on the replicate's
    biosample are reduced to the last path component of their identifier. Each new ID is
    printed as it is found, and all unique IDs are written to ``args.outfile``, one per line.
    """
    parser = get_parser()
    args = parser.parse_args()
    # Work on a fresh list so a None default is never appended to.
    rec_ids = list(args.records or [])
    infile = args.infile
    outfile = args.outfile
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if not rec_ids:
        # Then get them from input file
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                rec_ids.append(line)
    gms = []  # unique IDs in first-seen order
    seen = set()  # constant-time duplicate check
    for rec_id in rec_ids:
        rec = conn.get(rec_id)
        for rep in rec["replicates"]:
            # A biosample without genetic modifications may lack the key entirely.
            for gm in rep["library"]["biosample"].get("genetic_modifications", []):
                gm_id = gm.strip("/").split("/")[-1]
                if gm_id not in seen:
                    print(gm_id)
                    seen.add(gm_id)
                    gms.append(gm_id)
    with open(outfile, "w") as fout:
        for gm_id in gms:
            # One ID per line; without the newline the IDs run together and can't be parsed.
            fout.write(gm_id + "\n")
示例#3
0
def main():
    """POST or PATCH the payloads generated from an input file.

    Payloads come from ``create_payloads(profile_id=..., infile=...)``. Without ``--patch``
    each payload is POSTed. With ``--patch`` each payload must carry ``RECORD_ID_FIELD``,
    which is moved to ``conn.ENCID_KEY`` before the payload is PATCHed.

    Raises:
        Exception: In patch mode, when a payload has no ``RECORD_ID_FIELD`` value.
    """
    parser = get_parser()
    args = parser.parse_args()
    profile_id = args.profile_id
    dcc_mode = args.dcc_mode
    dry_run = args.dry_run
    no_aliases = args.no_aliases
    overwrite_array_values = args.overwrite_array_values

    if dcc_mode:
        conn = euc.Connection(dcc_mode, dry_run)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE. The dry-run flag must
        # still be honored here, otherwise a requested dry run would submit for real.
        conn = euc.Connection(dry_run=dry_run)

    # Put conn into submit mode:
    conn.set_submission(True)
    infile = args.infile
    patch = args.patch
    for payload in create_payloads(profile_id=profile_id, infile=infile):
        if not patch:
            conn.post(payload, require_aliases=not no_aliases)
            continue
        record_id = payload.get(RECORD_ID_FIELD, False)
        if not record_id:
            raise Exception(
                "Can't patch payload {} since there isn't a '{}' field indicating an identifier for the record to be PATCHED.".format(
                    euu.print_format_dict(payload), RECORD_ID_FIELD))
        # Replace the spreadsheet's ID column with the key that identifies the record to patch.
        payload.pop(RECORD_ID_FIELD)
        payload.update({conn.ENCID_KEY: record_id})
        conn.patch(payload=payload, extend_array_values=not overwrite_array_values)
示例#4
0
def main():
    """Start a transfer of Portal files from AWS to a GCP bucket.

    File identifiers are taken from ``args.file_ids`` and extended with any listed in
    ``args.infile`` (one per line; blank lines and "#" comment lines are skipped). AWS
    credentials, when given, are passed on as the ":"-separated parts of ``args.s3creds``.
    """
    parser = get_parser()
    args = parser.parse_args()
    desc = args.description
    aws_creds = args.s3creds
    if aws_creds:
        aws_creds = aws_creds.split(":")
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()

    # Work on a fresh list so a None default is never appended to.
    file_ids = list(args.file_ids or [])
    infile = args.infile
    if infile:
        # The context manager closes the file even if reading fails part way through.
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                file_ids.append(line)

    conn.gcp_transfer_from_aws(file_ids=file_ids,
                               gcp_bucket=args.gcpbucket,
                               gcp_project=args.gcpproject,
                               description=desc,
                               aws_creds=aws_creds)
示例#5
0
def main():
    """POST, PATCH, or remove-and-PATCH the payloads generated from an input file.

    ``--rm-patch`` and ``--remove-property`` must be given together; otherwise the parser
    exits with an error. Payloads come from ``create_payloads(schema=..., infile=...)``.
    Without ``--patch`` or ``--rm-patch`` each payload is POSTed. In either patch mode the
    payload must carry ``RECORD_ID_FIELD``, which is moved to ``conn.ENCID_KEY`` first.

    Raises:
        ValueError: In a patch mode, when a payload has no ``RECORD_ID_FIELD`` value.
    """
    parser = get_parser()
    args = parser.parse_args()
    if args.rm_patch and not args.remove_property:
        parser.error("No properties to remove were specified. Use --patch if only patching is needed.")
    if args.remove_property and not args.rm_patch:
        parser.error("Properties to remove were specified, but --rm-patch flag was not set.")

    profile_id = args.profile_id
    dcc_mode = args.dcc_mode
    dry_run = args.dry_run
    no_aliases = args.no_aliases
    overwrite_array_values = args.overwrite_array_values

    if dcc_mode:
        conn = euc.Connection(dcc_mode, dry_run)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE. The dry-run flag must
        # still be honored here, otherwise a requested dry run would submit for real.
        conn = euc.Connection(dry_run=dry_run)

    # Put conn into submit mode:
    conn.set_submission(True)

    schema = conn.profiles.get_profile_from_id(profile_id)
    infile = args.infile
    patch = args.patch
    rmpatch = args.rm_patch
    # Always bound, so the remove-and-patch call below can never hit an undefined name.
    props_to_remove = []
    if args.remove_property is not None:
        props_to_remove = args.remove_property.split(",")

    for payload in create_payloads(schema=schema, infile=infile):
        if not patch and not rmpatch:
            conn.post(payload, require_aliases=not no_aliases)
            continue
        # Both patch flavors identify the target record the same way.
        record_id = payload.get(RECORD_ID_FIELD, False)
        if not record_id:
            raise ValueError(
                "Can't patch payload {} since there isn't a '{}' field indicating an identifier for the record to be PATCHED.".format(
                    euu.print_format_dict(payload), RECORD_ID_FIELD))
        payload.pop(RECORD_ID_FIELD)
        payload.update({conn.ENCID_KEY: record_id})
        if rmpatch:
            # --rm-patch takes precedence over --patch when both are set.
            conn.remove_and_patch(props=props_to_remove, patch=payload, extend_array_values=not overwrite_array_values)
        else:
            conn.patch(payload=payload, extend_array_values=not overwrite_array_values)
示例#6
0
def main():
    """Set ``paired_with`` on read-2 FASTQ file records of the given experiments.

    Experiment identifiers are read from ``args.infile`` (one per line; blank lines and "#"
    comment lines are skipped). For every read-2 file of every replicate, the first alias
    containing "_R2_001" is used: the file is PATCHed with ``paired_with`` set to that alias
    with "_R2_001" replaced by "_R1_001". A failed PATCH is reported and processing moves on
    to the next file.
    """
    conn = euc.Connection("prod")
    reg = re.compile("_R2_001")
    parser = get_parser()
    args = parser.parse_args()
    ids = []
    with open(args.infile) as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            ids.append(line)
    for exp_id in ids:
        rep_hash = conn.get_fastqfile_replicate_hash(exp_id=exp_id)
        for bio_rep in rep_hash:
            for tech_rep in rep_hash[bio_rep]:
                # read_files is a list of file objects (read number 2)
                read_files = rep_hash[bio_rep][tech_rep].get(2)
                if not read_files:
                    continue
                for read_file in read_files:
                    for alias in read_file["aliases"]:
                        if not reg.search(alias):
                            continue
                        payload = {
                            conn.ENCID_KEY: alias,
                            "paired_with": alias.replace(reg.pattern, "_R1_001"),
                        }
                        try:
                            conn.patch(payload=payload)
                        except Exception as err:
                            # Best effort: report the failure instead of hiding it.
                            print("Failed to PATCH '{}': {}".format(alias, err))
                        # Only the first matching alias is tried, whether or not it succeeded.
                        break
示例#7
0
 def test_dry_run_enabled(self):
     """
     Verifies that ``check_dry_run`` reports True for a ``Connection`` that was created with
     the dry-run argument set to True.
     """
     dry_run_conn = connection.Connection(eu.DCC_DEV_MODE, True)
     self.conn = dry_run_conn
     self.assertEqual(True, dry_run_conn.check_dry_run())
示例#8
0
def main():
    """Run a Portal search and dump the results to a file as JSON.

    The search is given ``args.limit`` and ``args.url``; the list of results is serialized
    with ``json`` into ``args.outfile``.
    """
    parser = get_parser()
    args = parser.parse_args()
    dcc_mode = args.dcc_mode
    limit = args.limit
    outfile = args.outfile
    url = args.url

    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()

    # Search first: opening the output file beforehand would truncate it and leave an empty
    # file behind if the search failed.
    results = conn.search(limit=limit, url=url)  # returns a list of search results
    with open(outfile, "w") as fout:
        json.dump(results, fout)
示例#9
0
def main():
    """Import Portal target records into Pulsar as ``Target`` records.

    Targets come from a Portal search (``args.url``) or, when no URL is given, from a JSON
    file (``args.infile``). Records whose organism is not ``SPECIES`` are skipped. A target
    already in Pulsar under the same name is patched when its upstream identifier differs
    and skipped when it matches; unknown targets are created. Running counts are printed.

    Raises:
        Exception: When the Admin user can't be found in Pulsar.
    """
    parser = get_parser()
    args = parser.parse_args()
    url = args.url
    infile = args.infile
    if url:
        conn = euc.Connection("prod")
        results = conn.search(url=url)
    else:
        with open(infile) as fh:
            results = json.load(fh)
    admin = models.User.find_by({"first_name": "Admin"})
    if not admin:
        raise Exception("Could not find the Admin user in the database, which is needed for associating with new records.")
    # Maps a dbxref prefix to the payload field that stores the reference.
    field_by_prefix = {"ENSEMBL": "ensembl", "UniProtKB": "uniprotkb", "RefSeq": "refseq"}
    created = 0
    patched = 0
    total = 0
    for rec in results:
        total += 1
        if rec["organism"]["scientific_name"] != SPECIES:
            continue
        label = rec["label"]
        # Check if the target already exists in the database.
        pulsar_record = models.Target.find_by({"name": label})
        upstream = rec["@id"].strip("/").split("/")[-1]
        patch = False
        if pulsar_record:
            if upstream == pulsar_record["upstream_identifier"]:
                # Already present with the same upstream identifier; nothing to do.
                continue
            patch = True
        payload = {
            "name": label,
            "upstream_identifier": upstream,
            "user_id": admin["id"],
        }
        for xref in rec["dbxref"]:
            # rpartition never fails to unpack: a reference without ":" yields an empty
            # prefix, which matches no known field and is skipped.
            prefix, _, ref_id = xref.rpartition(":")
            field = field_by_prefix.get(prefix)
            if field:
                payload[field] = ref_id
        print("{} {}".format("Patching" if patch else "Creating", payload))
        if patch:
            target = models.Target(pulsar_record["id"])
            target.patch(payload)
            patched += 1
            print("Patched: {}".format(patched))
        else:
            models.Target.post(payload)
            created += 1
            print("Created: {}".format(created))
        print("Total processed: {}".format(total))
示例#10
0
def main():
    """Import the biosample of every replicate on every experiment returned by a search.

    The search URL comes from ``args.url`` and is run against a "prod" connection. Each
    hit is fetched in full, and ``bp.biosample`` is called with the ``@id`` of the biosample
    attached to each replicate's library.
    """
    args = get_parser().parse_args()
    search_url = args.url
    conn = euc.Connection("prod")
    for hit in conn.search(url=search_url):
        experiment = conn.get(hit["@id"])
        for replicate in experiment["replicates"]:
            biosample = replicate["library"]["biosample"]
            bp.biosample(rec_id=biosample["@id"])
示例#11
0
def main():
    """Start a GCP transfer for the file identifiers given on the command line.

    ``args.s3creds``, when set, is split on ":" before being passed on as ``aws_creds``.
    The connection uses ``args.dcc_mode`` when given, else the library's default mode.
    """
    args = get_parser().parse_args()
    description = args.description
    creds = args.s3creds
    # A truthy credentials string becomes its ":"-separated parts; anything else is passed as is.
    creds = creds.split(":") if creds else creds
    # Connect to the Portal
    mode = args.dcc_mode
    # With no explicit mode, the default dcc_mode is taken from environment variable DCC_MODE.
    conn = euc.Connection(mode) if mode else euc.Connection()

    conn.gcp_transfer(
        file_ids=args.file_ids,
        gcp_bucket=args.gcpbucket,
        gcp_project=args.gcpproject,
        description=description,
        aws_creds=creds,
    )
示例#12
0
def main():
    """Regenerate AWS upload credentials for a set of File records.

    File identifiers are taken from ``args.file_ids`` and extended with any listed in
    ``args.infile`` (one per line; blank lines and "#" comment lines are skipped, matching
    the other input-file readers in this code base).
    """
    parser = get_parser()
    args = parser.parse_args()
    # Work on a fresh list so a None default is never appended to.
    file_ids = list(args.file_ids or [])
    infile = args.infile
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if infile:
        # The context manager closes the file even if reading fails part way through.
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                file_ids.append(line)
    for file_id in file_ids:
        print("Generating upload credentials for File record '{}'.".format(file_id))
        conn.regenerate_aws_upload_creds(file_id=file_id)
示例#13
0
def main():
    """Write a GCP transfer URL list for a set of File records.

    File identifiers are taken from ``args.file_ids`` and extended with any listed in
    ``args.infile`` (one per line; blank lines and "#" comment lines are skipped). The
    list is handed to ``conn.gcp_transfer_urllist`` together with ``args.outfile``.
    """
    parser = get_parser()
    args = parser.parse_args()
    outfile = args.outfile
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()

    # Work on a fresh list so a None default is never appended to.
    file_ids = list(args.file_ids or [])
    infile = args.infile
    if infile:
        # The context manager closes the file even if reading fails part way through.
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                file_ids.append(line)

    conn.gcp_transfer_urllist(file_ids=file_ids, filename=outfile)
#!/usr/bin/env python3

###
# Nathaniel Watson
# [email protected]
# 2019-02-25
###
import encode_utils.connection as euc
# Script body: walk the characterization reviews of selected antibody lots on the Portal.
conn = euc.Connection("prod")

# Not filled in within the visible part of this script.
# NOTE(review): the script appears truncated after the lane-status filter below.
dico = {}
# All released antibodies from Snyder's team that are 'characterized to standards', 'partially characterized',
# or 'characterized to standards with exemption' in human datasets.
# The organism filter must read "Homo+sapiens"; a garbled organism name makes the query invalid.
url = "https://www.encodeproject.org/search/?type=AntibodyLot&status=released&lot_reviews.status=characterized+to+standards&targets.organism.scientific_name=Homo+sapiens&characterizations.lab.title=Michael+Snyder%2C+Stanford&lot_reviews.status=partially+characterized&lot_reviews.status=characterized+to+standards+with+exemption"

# An AntibodyLot has many Characterizations - one for each target (gene).
# A Characterization has many characterization_reviews, one for each cell line, primary cell, tissue, etc.
results = conn.search(url=url)
for i in results:
    # Fetch the full antibody lot record for each search hit.
    ab = conn.get(i["@id"])
    chars = ab["characterizations"]
    for char in chars:
        target = char["target"]["label"]
        try:
            reviews = char["characterization_reviews"]
        except KeyError:
            # Characterization without any reviews; nothing to inspect.
            continue
        for r in reviews:
            lane_status = r["lane_status"]
            # Only reviews with a "compliant" lane are of interest.
            if lane_status != "compliant":
                continue
示例#15
0
import json
import pdb
import re

import encode_utils.connection as euc
import encode_utils.utils as euu
import pulsarpy.models as models
import pulsarpy.utils

# Case-insensitive pattern matching the text "protocol".
protocol_regx = re.compile(r'protocol', re.IGNORECASE)

# Property names shared across this module (presumably keys of Portal/Pulsar records;
# verify against their use further down the file).
ACCESSION_PROP = "accession"
ALIASES_PROP = "aliases"
UPSTREAM_PROP = "upstream_identifier"
UUID_PROP = "uuid"
# Module-wide connection in "prod" mode; note that it is created at import time.
ENC_CONN = euc.Connection("prod")
# ID of the Pulsar user to associate with records. It is hard-coded here; the commented-out
# line below shows the alternative of looking the user up by email.
#ADMIN_USER_ID = models.User.find_by({"email": "*****@*****.**"})["id"]
ADMIN_USER_ID = 1

# Biosamples to import for Jessika:
# https://www.encodeproject.org/search/?type=Biosample&lab.title=Michael+Snyder%2C+Stanford&award.rfa=ENCODE4&biosample_type=tissue
# which sum up to the following 30 accessions:
#['ENCBS443KFH', 'ENCBS251BGN', 'ENCBS558OUC', 'ENCBS319AUC', 'ENCBS895UDJ', 'ENCBS303LEQ', 'ENCBS208HYM', 'ENCBS273ENZ', 'ENCBS704NZI', 'ENCBS444XVA', 'ENCBS924ALU', 'ENCBS892KPS', 'ENCBS729ENA', 'ENCBS268DOV', 'ENCBS157OMX', 'ENCBS441BYJ', 'ENCBS858UHJ', 'ENCBS577DQE', 'ENCBS655YSD', 'ENCBS064HXH', 'ENCBS790AKV', 'ENCBS437TFK', 'ENCBS465UNR', 'ENCBS278ABD', 'ENCBS131SFJ', 'ENCBS605RWM', 'ENCBS722FKO', 'ENCBS603AMH', 'ENCBS222RAG', 'ENCBS649EIC']


def set_name(rec):
    """
    Most of the models in Pulsar have a name attribute, and most of the time it is required.
    When backporting a record from the ENCODE Portal, we need some value to use as the record's name,
    and records in the Portal don't have a name prop, so we need to use some other propery value.
示例#16
0
def _write_row(fh, fields):
    """Write one row to ``fh``: every field followed by a tab, then a newline.

    Values are formatted with ``str.format``, so non-string values (numbers, lists, None)
    are written in their ``str()`` form instead of raising on concatenation.
    """
    fh.write("".join("{}\t".format(field) for field in fields) + "\n")


def _open_table(stack, outdir, filename, header):
    """Open ``outdir/filename`` for appending and write ``header`` if the file is empty.

    The handle is registered on ``stack`` so that it is closed when the stack exits.
    """
    path = os.path.join(outdir, filename)
    fh = stack.enter_context(open(path, "a"))
    if file_is_empty(path):
        fh.write("\t".join(header) + "\n")
    return fh


def main():
    """Export one Portal experiment into six tab-delimited files for import into Pulsar.

    The experiment ``args.exp`` is fetched and rows are appended to the experiment, file,
    replicate, genetic-modification, biosample and library tables in ``args.outdir`` (a
    header is written first to any table that is still empty). Columns that have no Portal
    counterpart are left empty. The module-level ``CONN`` is set as a side effect.
    """
    global CONN
    import contextlib

    parser = get_parser()
    args = parser.parse_args()
    exp_id = args.exp
    outdir = args.outdir
    # Also creates missing parent directories and tolerates an existing directory.
    os.makedirs(outdir, exist_ok=True)
    dcc_mode = args.dcc_mode

    if dcc_mode:
        CONN = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        CONN = euc.Connection()

    exp = CONN.get(exp_id)

    # The stack closes every output file, even when an error interrupts the export.
    with contextlib.ExitStack() as stack:
        expfh = _open_table(stack, outdir, EXP_TAB, EXP_HEADER)
        ffh = _open_table(stack, outdir, FILE_TAB, FILE_HEADER)
        repfh = _open_table(stack, outdir, REP_TAB, REP_HEADER)
        gmfh = _open_table(stack, outdir, GM_TAB, GM_HEADER)
        biofh = _open_table(stack, outdir, BIO_TAB, BIO_HEADER)
        libfh = _open_table(stack, outdir, LIB_TAB, LIB_HEADER)

        # EXPERIMENT FILE
        exp_alias = exp["aliases"][0]
        _write_row(expfh, [
            "",  # empty for name field in Pulsar
            exp["accession"],
            exp_alias,
            exp["description"],
            exp["target"]["name"],
            ",".join(portal_ids_to_aliases(exp["documents"])),
            exp.get("submitter_comment", ""),
            "",  # empty for notes field in Pulsar
        ])

        # FILE FILE
        for f in CONN.get_fastqfiles_on_exp(exp_id):
            rep = f["replicate"]
            # Use the first flowcell entry; a missing or empty list yields empty columns.
            flowcells = f.get("flowcell_details") or []
            fc = flowcells[0] if flowcells else {}
            _write_row(ffh, [
                "",  # empty for name field in Pulsar
                f["accession"],
                f["aliases"][0],
                f["platform"]["aliases"][-1],
                f.get("submitted_file_name", ""),
                rep.get("uuid", ""),
                rep["aliases"][0],
                f.get("run_type", ""),
                ",".join(f.get("controlled_by", [])),
                f.get("paired_with", ""),
                f.get("paired_end", ""),
                f.get("read_count", ""),
                f.get("read_length", ""),
                fc.get("barcode", ""),
                fc.get("machine", ""),
                fc.get("lane", ""),
            ])

        for rep in exp["replicates"]:
            lib = rep["library"]
            bio = lib["biosample"]
            biosample_alias = bio["aliases"][0]
            antibody = rep.get("antibody", "")

            # REPLICATE FILE
            _write_row(repfh, [
                "",  # empty for name field in Pulsar
                rep["aliases"][0],
                exp_alias,
                "",  # empty for biosample_id fkey field in Pulsar
                biosample_alias,
                rep["biological_replicate_number"],
                rep["technical_replicate_number"],
                "",  # empty for antibody_id fkey field in Pulsar
                antibody["accession"] if antibody else "",
                rep.get("submitter_comment", ""),
                "",  # empty for notes field in Pulsar
            ])

            # BIOSAMPLE FILE
            treatment_uuids = [x["uuid"] for x in bio.get("treatments", [])]
            date_taken = bio.get("culture_start_date", "")
            if not date_taken:
                date_taken = bio.get("date_obtained", "")
            _write_row(biofh, [
                "",  # empty for name field in Pulsar
                bio["accession"],
                biosample_alias,
                bio.get("part_of", ""),
                bio.get("nih_institutional_certification", ""),
                ",".join(bio.get("pooled_from", [])),
                ",".join(portal_ids_to_aliases(treatment_uuids)),
                ",".join(portal_ids_to_aliases(bio.get("documents", []))),
                bio["biosample_type"],
                bio["biosample_term_name"],
                bio["source"]["name"],
                bio.get("product_id", ""),
                bio.get("lot_id", ""),
                bio["donor"]["aliases"][0],
                # presumably numeric on the Portal (TODO confirm); written in str() form
                bio.get("passage_number", ""),
                date_taken,
                bio.get("submitter_comment", ""),
                "",  # empty for notes field in Pulsar
            ])

            # GENETIC MODIFICATIONS FILE
            for gm_id in bio.get("genetic_modifications", []):
                gm = CONN.get(gm_id)
                _write_row(gmfh, [
                    "",  # empty for name field in Pulsar
                    gm["accession"],
                    gm["aliases"][0],
                    biosample_alias,
                    gm.get("description", ""),
                    ",".join(portal_ids_to_aliases(gm.get("documents", []))),
                    gm.get("category", ""),
                    gm.get("purpose", ""),
                    gm.get("method", ""),
                    ",".join(gm.get("guide_rna_sequences", [])),
                    gm.get("introduced_tags", []),  # written in str() form
                    gm.get("reagents"),  # written in str() form ("None" when absent)
                    ",".join(gm.get("characterizations", [])),
                    "",  # empty for crispr_construct_ids in Pulsar
                    "",  # empty for donor_construct_id in Pulsar
                    "",  # empty for notes field in Pulsar
                ])

            # LIBRARY FILE
            _write_row(libfh, [
                "",  # empty for name field in Pulsar
                lib["accession"],
                lib["aliases"][0],
                biosample_alias,
                lib["nucleic_acid_term_name"],
                lib.get("strand_specificity", False),
                ",".join(portal_ids_to_aliases(lib["documents"])),
                lib.get("size_range", ""),
                ",".join(portal_ids_to_aliases(lib["treatments"])),
                lib.get("source", ""),
                lib.get("product_id", ""),
                lib.get("lot_id", ""),
                lib.get("fragmentation_method", ""),
                "",  # empty for sequencing_library_prep_kit_id field in Pulsar
                "",  # empty for paired_end field in Pulsar
                "",  # empty for barcode_id in Pulsar
                "",  # empty for paired_barcode_id in Pulsar
                lib.get("submitter_comment", ""),
                "",  # empty for notes field in Pulsar
            ])
示例#17
0
 def test_arbitrary_host(self):
     """
     Instantiates ``Connection`` with a host name given as the ``dcc_mode``; the test passes
     when construction raises no error.
     """
     host = 'test.encodedcc.org'
     self.conn = connection.Connection(dcc_mode=host)
示例#18
0
 def setUp(self):
     """
     Stores a ``Connection`` built with ``eu.DCC_DEV_MODE`` on ``self.conn``.
     """
     dev_mode = eu.DCC_DEV_MODE
     self.conn = connection.Connection(dev_mode)