def set_up(self):
    """Announce the run mode and collect the identifiers to process.

    Prints one notice per active flag (``UPDATE``, ``FORCE``, ``LOGALL``),
    then fills ``self.ACCESSIONS`` from ``self.infile`` when it is set,
    otherwise from the objects returned for ``self.QUERY``.

    ``self.infile`` is read as a file (one identifier per line) when it
    names an existing file, and split on commas otherwise.

    Exits the process with status 1 when no identifier was collected.
    """
    if self.UPDATE:
        print("WARNING: This run is an UPDATE run objects will be released.")
    else:
        print("Object status will be checked but not changed")
    if self.FORCE:
        print("WARNING: Objects that do not pass audit will be FORCE-released")
    if self.LOGALL:
        print("Logging all statuses")

    if self.infile:
        if os.path.isfile(self.infile):
            # Context manager so the handle is closed as soon as the
            # lines are read instead of whenever the GC gets to it.
            with open(self.infile) as handle:
                self.ACCESSIONS = [line.rstrip("\n") for line in handle]
        else:
            self.ACCESSIONS = self.infile.split(",")
    elif self.QUERY:
        if "search" in self.QUERY:
            # Search results carry their hits under "@graph".
            temp = encodedcc.get_ENCODE(
                self.QUERY, self.connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(self.QUERY, self.connection)]
        if any(temp):
            # NOTE: this branch appends, so self.ACCESSIONS must already
            # exist on the instance before set_up() is called.
            for obj in temp:
                # First identifier found wins, in this order of preference.
                if obj.get("accession"):
                    self.ACCESSIONS.append(obj["accession"])
                elif obj.get("uuid"):
                    self.ACCESSIONS.append(obj["uuid"])
                elif obj.get("@id"):
                    self.ACCESSIONS.append(obj["@id"])
                elif obj.get("aliases"):
                    self.ACCESSIONS.append(obj["aliases"][0])
    if not self.ACCESSIONS:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
예제 #2
0
def main():
    """Resolve a list of identifiers and fetch each one from ENCODE.

    Identifiers come from ``args.object`` (a file with one identifier per
    line, or a comma-separated string) or, failing that, from the objects
    returned for ``args.query``.  Exits with status 1 when the list ends
    up empty.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # Close the file deterministically instead of leaking the handle.
            with open(args.object) as handle:
                accessions = [line.strip() for line in handle]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            # Search results carry their hits under "@graph".
            temp = encodedcc.get_ENCODE(
                args.query, connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # First identifier found wins, in this order of preference.
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if not accessions:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        # The response is not used here; only the request is made.
        encodedcc.get_ENCODE(acc, connection)
def main():
    """Print the read headers or read lengths of fastq files.

    File accessions come from ``args.infile`` (one per line), from the
    fastq files of every experiment returned for ``args.query``, or from
    the single ``args.accession``.  Exits with status 1 when none of the
    three is given.

    For each accession the reads are iterated with
    ``encodedcc.fastq_read``; with ``args.header`` set the header line is
    printed, otherwise the accession and the sequence length.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        # Close the file deterministically instead of leaking the handle.
        with open(args.infile) as handle:
            accessions = [line.rstrip("\n") for line in handle]
    elif args.query:
        data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for exp in data:
            for f in exp.get("files", []):
                # Each file link is fetched to learn its format.
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check")
        sys.exit(1)
    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        reads = encodedcc.fastq_read(connection, uri=link)
        for header, sequence, qual_header, quality in reads:
            if args.header:
                header = header.decode("UTF-8")
                print(header)
            else:
                sequence = sequence.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)))
예제 #4
0
def main():
    """Resolve a list of identifiers and fetch each one from ENCODE.

    Identifiers come from ``args.object`` (a file with one identifier per
    line, or a comma-separated string) or, failing that, from the objects
    returned for ``args.query``.  Exits with status 1 when the list ends
    up empty.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # Close the file deterministically instead of leaking the handle.
            with open(args.object) as handle:
                accessions = [line.strip() for line in handle]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            # Search results carry their hits under "@graph".
            temp = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # First identifier found wins, in this order of preference.
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if not accessions:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        # The response is not used here; only the request is made.
        encodedcc.get_ENCODE(acc, connection)
예제 #5
0
    def process_link(self, identifier_link, approved_types):
        """Fetch a linked object and expand it via ``get_status`` if allowed.

        The link is only considered when its second ``/``-separated
        segment (hyphens removed) is in ``self.profiles_ref`` and the link
        is not already in ``self.searched``.  An object whose first
        ``@type`` is ``File`` is not expanded, and its ``@id`` is added to
        ``self.searched``, when ``is_restricted`` returns ``True`` or
        ``has_inactive_pipeline`` returns a truthy value for it.
        """
        collection = identifier_link.split("/")[1].replace("-", "")
        linked = encodedcc.get_ENCODE(identifier_link, self.connection)
        linked_type = linked["@type"][0]

        if collection not in self.profiles_ref or \
           identifier_link in self.searched:
            return

        # Set when either File check below rules the object out.
        blocked = False
        if linked_type == 'File':
            if self.is_restricted(linked) is True:
                print(linked['@id'] + ' is restricted, ' +
                      'therefore will not be released')
                blocked = True
                self.searched.append(linked["@id"])
            if linked.get('analysis_step_version'):
                # The pipeline check works on the embedded frame.
                embedded = encodedcc.get_ENCODE(
                    identifier_link, self.connection, "embedded")
                inactive = self.has_inactive_pipeline(embedded)
                if inactive:
                    print('{} is only associated with inactive pipelines'
                          ' and therefore will not be released: {}'.format(
                              linked['@id'], inactive))
                    blocked = True
                    self.searched.append(linked["@id"])

        # expand subobject
        if linked_type in approved_types and not blocked:
            lower_levels = hi.dictionary_of_lower_levels.get(
                hi.levels_mapping.get(linked_type))
            self.get_status(linked, lower_levels)
def main():
    """Rebuild ``alternate_accessions`` lists from the replaced objects.

    For every profile that declares an ``alternate_accessions`` property,
    each object's list is recomputed with ``retreive_list_of_replaced``.
    Objects whose recomputed set differs from the stored list are
    candidates; a candidate is patched only when its set is contained in
    exactly one candidate set (its own).  Before patching, every object
    found under one of those accessions has its own list emptied.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles:
        properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have alternate accessions property
        if not (properties and properties.get('alternate_accessions')):
            continue

        # uuid -> recomputed set of alternate accessions
        corrected = {}
        hits = encodedcc.get_ENCODE('search/?type=' + object_type,
                                    connection)['@graph']
        for entry in hits:
            current = entry.get('alternate_accessions')
            if not current:
                continue
            replaced = []
            for acc in current:
                replaced.extend(retreive_list_of_replaced(acc, connection))
            if sorted(set(replaced)) != sorted(current):
                corrected[entry['uuid']] = set(replaced)

        for uuid, accessions in corrected.items():
            # Count the candidate sets (itself included) containing this one.
            containing = sum(
                1 for other in corrected.values() if accessions <= other)
            if containing != 1:
                continue

            for acc in list(accessions):
                stale_objects = encodedcc.get_ENCODE(
                    'search/?type=Item&accession=' + acc,
                    connection)['@graph']
                for stale in stale_objects:
                    print(stale['uuid'] +
                          ' alternate accessions list ' +
                          str(stale['alternate_accessions']) +
                          ' is removed')
                    encodedcc.patch_ENCODE(
                        stale['uuid'],
                        connection,
                        {"alternate_accessions": []})

            payload = {"alternate_accessions": list(accessions)}
            print(uuid + ' is patched with ' + str(payload))
            encodedcc.patch_ENCODE(uuid, connection, payload)
 def check_ENCODE(self, idList, connection, otherIdList=None, bothDicts=None):
     """Check each PubMed id against ENCODE; compare, patch or create.

     For every id in ``idList``:

     * if a publication search for ``PMID:<id>`` has hits, the first hit
       is compared with ``compare_entrez_ENCODE`` (unless
       ``self.CREATE_ONLY``);
     * otherwise each entry of ``otherIdList`` is searched and matched on
       title against ``self.entrezDict``; a match is compared and, with
       ``self.UPDATE``, patched so its identifiers include the PMID;
     * if nothing matched and ``self.CREATE`` is set, a new publication
       is POSTed.

     ``bothDicts`` maps a PMID to extra data (``categories``,
     ``published_by``, ``data_used``); a PMID missing from it is treated
     as having no extra data when the POST body is built.
     """
     # None instead of mutable defaults; an omitted argument still means empty.
     otherIdList = [] if otherIdList is None else otherIdList
     bothDicts = {} if bothDicts is None else bothDicts
     for pmid in idList:
         extraData = bothDicts.get(pmid)
         ENCODEvalue = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=PMID:" + pmid, connection)
         if ENCODEvalue.get("@graph"):
             logger.info("PMID %s is listed in ENCODE", pmid)
             uuid = ENCODEvalue.get("@graph")[0].get("uuid")
             if not self.CREATE_ONLY:
                 self.compare_entrez_ENCODE(uuid, pmid, connection, extraData)
             continue

         if self.CREATE_ONLY:
             self.get_entrez([pmid])
         titleEntrez = self.entrezDict[pmid].get("title")
         found = False
         for otherID in otherIdList:
             titleENCODE = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=" + otherID, connection)
             if titleENCODE.get("title") != titleEntrez:
                 continue
             other_uuid = titleENCODE.get("uuid")
             # %s formatting keeps this from raising if the uuid is missing.
             logger.warning("%s is in ENCODE by a different name %s", pmid, other_uuid)
             self.compare_entrez_ENCODE(other_uuid, pmid, connection, extraData)
             if self.UPDATE:
                 # Copy, and tolerate a publication with no identifiers yet.
                 newIdent = list(titleENCODE.get("identifiers") or [])
                 newIdent.append("PMID:" + pmid)
                 patch_dict = {"identifiers": newIdent}
                 encodedcc.patch_ENCODE(other_uuid, connection, patch_dict)
             found = True
         if found:
             continue

         logger.warning("This publication is not listed in ENCODE %s", pmid)
         if self.CREATE:
             self.POST_COUNT += 1
             pmidData = self.entrezDict[pmid]
             # extraData is None when bothDicts has no entry for this PMID;
             # fall back to an empty mapping so the lookups below don't crash.
             extra = extraData or {}
             logger.info("POSTing the new object: %s", pmid)
             post_dict = {
                 "title": pmidData.get("title"),
                 "abstract": pmidData.get("abstract"),
                 "submitted_by": "/users/8b1f8780-b5d6-4fb7-a5a2-ddcec9054288/",
                 "lab": "/labs/encode-consortium/",
                 "award": "/awards/ENCODE/",
                 "categories": extra.get("categories"),
                 "published_by": extra.get("published_by"),
                 "date_published": pmidData.get("date_published"),
                 "authors": pmidData.get("authors"),
                 "identifiers": ["PMID:" + pmid],
                 "journal": pmidData.get("journal"),
                 "volume": pmidData.get("volume"),
                 "issue": pmidData.get("issue"),
                 "page": pmidData.get("page"),
                 "status": "published"
             }
             if extra.get("data_used"):
                 post_dict["data_used"] = extra.get("data_used")
             encodedcc.new_ENCODE(connection, "publications", post_dict)
 def check_ENCODE(self, idList, connection, otherIdList=None, bothDicts=None):
     """Check each PubMed id against ENCODE; compare, patch or create.

     For every id in ``idList``:

     * if a publication search for ``PMID:<id>`` has hits, the first hit
       is compared with ``compare_entrez_ENCODE`` (unless
       ``self.CREATE_ONLY``);
     * otherwise each entry of ``otherIdList`` is searched and matched on
       title against ``self.entrezDict``; a match is compared and, with
       ``self.UPDATE``, patched so its identifiers include the PMID;
     * if nothing matched and ``self.CREATE`` is set, a new publication
       is POSTed.

     ``bothDicts`` maps a PMID to extra data (``categories``,
     ``published_by``, ``data_used``); a PMID missing from it is treated
     as having no extra data when the POST body is built.
     """
     # None instead of mutable defaults; an omitted argument still means empty.
     otherIdList = [] if otherIdList is None else otherIdList
     bothDicts = {} if bothDicts is None else bothDicts
     for pmid in idList:
         extraData = bothDicts.get(pmid)
         ENCODEvalue = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=PMID:" + pmid, connection)
         if ENCODEvalue.get("@graph"):
             logger.info("PMID %s is listed in ENCODE", pmid)
             uuid = ENCODEvalue.get("@graph")[0].get("uuid")
             if not self.CREATE_ONLY:
                 self.compare_entrez_ENCODE(uuid, pmid, connection, extraData)
             continue

         if self.CREATE_ONLY:
             self.get_entrez([pmid])
         titleEntrez = self.entrezDict[pmid].get("title")
         found = False
         for otherID in otherIdList:
             titleENCODE = encodedcc.get_ENCODE("/search/?type=publication&searchTerm=" + otherID, connection)
             if titleENCODE.get("title") != titleEntrez:
                 continue
             other_uuid = titleENCODE.get("uuid")
             # %s formatting keeps this from raising if the uuid is missing.
             logger.warning("%s is in ENCODE by a different name %s", pmid, other_uuid)
             self.compare_entrez_ENCODE(other_uuid, pmid, connection, extraData)
             if self.UPDATE:
                 # Copy, and tolerate a publication with no identifiers yet.
                 newIdent = list(titleENCODE.get("identifiers") or [])
                 newIdent.append("PMID:" + pmid)
                 patch_dict = {"identifiers": newIdent}
                 encodedcc.patch_ENCODE(other_uuid, connection, patch_dict)
             found = True
         if found:
             continue

         logger.warning("This publication is not listed in ENCODE %s", pmid)
         if self.CREATE:
             self.POST_COUNT += 1
             pmidData = self.entrezDict[pmid]
             # extraData is None when bothDicts has no entry for this PMID;
             # fall back to an empty mapping so the lookups below don't crash.
             extra = extraData or {}
             logger.info("POSTing the new object: %s", pmid)
             post_dict = {
                 "title": pmidData.get("title"),
                 "abstract": pmidData.get("abstract"),
                 "submitted_by": "/users/8b1f8780-b5d6-4fb7-a5a2-ddcec9054288/",
                 "lab": "/labs/encode-consortium/",
                 "award": "/awards/ENCODE/",
                 "categories": extra.get("categories"),
                 "published_by": extra.get("published_by"),
                 "date_published": pmidData.get("date_published"),
                 "authors": pmidData.get("authors"),
                 "identifiers": ["PMID:" + pmid],
                 "journal": pmidData.get("journal"),
                 "volume": pmidData.get("volume"),
                 "issue": pmidData.get("issue"),
                 "page": pmidData.get("page"),
                 "status": "published"
             }
             if extra.get("data_used"):
                 post_dict["data_used"] = extra.get("data_used")
             encodedcc.new_ENCODE(connection, "publications", post_dict)
def excel_reader(datafile, sheet, update, connection, patchall):
    """POST or PATCH one object per data row of a spreadsheet sheet.

    The first row yielded by ``reader`` supplies the property names.  Each
    later row is turned into a dict, passed through ``dict_patcher`` and
    looked up on the server by uuid, first alias, accession or @id (in
    that order).  An existing object is patched when ``patchall`` is set
    or the user answers "y" at the prompt; a missing one is POSTed to the
    ``sheet`` collection only when ``update`` is set.  A summary line is
    printed at the end.
    """
    rows = reader(datafile, sheetname=sheet)
    headers = next(rows)  # grab the first row of headers
    counts = {"total": 0, "error": 0, "success": 0, "patch": 0}

    def tally(response, patched):
        # Fold one server response into the running counters.
        outcome = response["status"]
        if outcome == "error":
            counts["error"] += 1
        elif outcome == "success":
            counts["success"] += 1
            if patched:
                counts["patch"] += 1

    for values in rows:
        counts["total"] += 1
        record = dict_patcher(dict(zip(headers, values)))
        # add attachments here
        if record.get("attachment"):
            record["attachment"] = attachment(record["attachment"])
        print(record)

        # Look for an existing object under the first identifier available.
        if record.get("uuid"):
            existing = encodedcc.get_ENCODE(record["uuid"], connection)
        elif record.get("aliases"):
            existing = encodedcc.get_ENCODE(quote(record["aliases"][0]),
                                            connection)
        elif record.get("accession"):
            existing = encodedcc.get_ENCODE(record["accession"], connection)
        elif record.get("@id"):
            existing = encodedcc.get_ENCODE(record["@id"], connection)
        else:
            existing = {}

        if existing.get("uuid"):
            if patchall:
                do_patch = True
            else:
                print("Object {} already exists.  Would you like to patch it instead?".format(existing["uuid"]))
                answer = input("PATCH? y/n ")
                do_patch = answer.lower() == "y"
            if do_patch:
                tally(encodedcc.patch_ENCODE(existing["uuid"], connection,
                                             record),
                      patched=True)
        elif update:
            print("POSTing data!")
            tally(encodedcc.new_ENCODE(connection, sheet, record),
                  patched=False)

    print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched".format(
        sheet=sheet.upper(), **counts))
예제 #10
0
def main():
    """Write a tab-separated report of bam files to ``output.txt``.

    Runs a fixed experiment search, and for every experiment that has
    ``possible_controls`` writes one row per bam file, filled from the
    experiment, the file and, when linked, the file's replicate, library
    and biosample.  Progress is printed as objects are fetched.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    query = "/search/?type=Experiment&lab.title=Brenton+Graveley%2C+UConn&award.project=ENCODE&status=released&files.file_type=bam"
    data = encodedcc.get_ENCODE(query, connection).get("@graph", [])
    headers = ["File Accession", "Download", "Annotation", "Cell Line", "Assembly",  "Target", "Experiment Accession", "Experiment Aliases",
               "Control Experiment", "Biosample Accession", "Biosample Aliases", "Library Accession", "Library Aliases", "Lab", "Submitted Name"]
    # newline="" hands line-ending control to the csv module, as its docs
    # require; without it rows gain an extra "\r" on Windows.
    with open("output.txt", "w", newline="") as tsvfile:
        writer = csv.DictWriter(tsvfile, fieldnames=headers, delimiter="\t")
        writer.writeheader()
        for exp in data:
            if not exp.get("possible_controls"):
                continue
            print("Experiment", exp.get("accession"))
            # Columns shared by every file row of this experiment.
            exp_fields = dict.fromkeys(headers)
            exp_fields["Experiment Accession"] = exp.get("accession")
            exp_fields["Experiment Aliases"] = exp.get("aliases")
            exp_fields["Cell Line"] = exp.get("biosample_term_name")
            exp_fields["Target"] = exp.get("target")
            exp_fields["Control Experiment"] = exp["possible_controls"]
            files = exp["files"] if exp.get("files") else exp["original_files"]
            for f in files:
                file_obj = encodedcc.get_ENCODE(f, connection)
                if file_obj.get("file_format", "") != "bam":
                    continue
                # this is a bam file and we want it
                # Fresh copy per file: a shared dict would carry the
                # library/biosample columns of one bam into the row of
                # the next bam that lacks a replicate or library.
                row = dict(exp_fields)
                row["Lab"] = file_obj.get("lab")
                row["Annotation"] = file_obj.get("genome_annotation")
                row["File Accession"] = file_obj.get("accession")
                row["Submitted Name"] = file_obj.get("submitted_file_name")
                row["Download"] = connection.server + "/files/" + file_obj["accession"] + "/@@download/" + file_obj["accession"] + ".bam"
                row["Assembly"] = file_obj.get("assembly")
                print("File", file_obj.get("accession"))
                if file_obj.get("replicate"):
                    rep = encodedcc.get_ENCODE(file_obj["replicate"], connection)
                    if rep.get("library"):
                        lib = encodedcc.get_ENCODE(rep["library"], connection)
                        row["Library Accession"] = lib.get("accession")
                        row["Library Aliases"] = lib.get("aliases")
                        print("Library", lib.get("accession"))
                        if lib.get("biosample"):
                            bio = encodedcc.get_ENCODE(lib["biosample"], connection)
                            row["Biosample Accession"] = bio.get("accession")
                            row["Biosample Aliases"] = bio.get("aliases")
                            print("Biosample", bio.get("accession"))
                writer.writerow(row)
예제 #11
0
    def find_ENCODE_extras(self, communityList, consortiumList, connection):
        '''Find publications in the ENCODE database that are not in the
        files provided.

        Runs one search for community publications and one for all
        others, and splits each result's identifiers into PMIDs and
        non-PMID entries.

        Returns a 4-tuple:
        (community PMID identifiers not in ``communityList``,
         ``@id`` of community publications with a non-PMID identifier,
         consortium PMID identifiers not in ``consortiumList``,
         ``@id`` of consortium publications with a non-PMID identifier).
        '''
        def split_identifiers(publications):
            # Separate PMID identifiers from the @id of publications that
            # carry some other (non-PMID, non-PMCID) identifier.
            pmids = []   # list of PMID from ENCODE site
            others = []  # list of non-PMID ids from ENCODE site
            for pub in publications:
                for idNum in pub.get("identifiers", []):
                    if "PMID:" in idNum:
                        # this is something that has a pubmed ID
                        pmids.append(idNum)
                    elif "PMCID:PMC" in idNum:
                        # this is an alternate PMID
                        continue
                    else:
                        # this is something that does not have a PMID yet,
                        # find it and PATCH it in
                        others.append(pub.get("@id"))
            return pmids, others

        # Adjacent literals instead of a backslash continuation inside the
        # string: the continuation kept the next line's leading spaces and
        # embedded them in the URL.
        community_url = ("/search/?type=publication&status=published"
                         "&published_by=community&field=identifiers&limit=all")
        consortium_url = ("/search/?type=publication&status=published"
                          "&published_by!=community&field=identifiers&limit=all")

        # "or []" so a response without "@graph" yields no publications
        # instead of a TypeError when iterating None.
        communityResult = encodedcc.get_ENCODE(community_url,
                                               connection).get("@graph") or []
        consortiumResult = encodedcc.get_ENCODE(consortium_url,
                                                connection).get("@graph") or []

        communityPMIDfromENCODE, communityOtherID = \
            split_identifiers(communityResult)
        community_ENCODE_Only = list(
            set(communityPMIDfromENCODE) - set(communityList))

        consortiumPMIDfromENCODE, consortiumOtherID = \
            split_identifiers(consortiumResult)
        consortium_ENCODE_Only = list(
            set(consortiumPMIDfromENCODE) - set(consortiumList))
        return community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID
def main():
    """Print unreleased experiments whose first possible control is released.

    Runs ``args.query`` and, for every returned experiment that lists
    ``possible_controls`` and whose status is not "released", fetches the
    first control; when that control is released a tab-separated line with
    both accessions and statuses is printed under a header line.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Default to an empty list so a response without "@graph" prints only
    # the header instead of raising TypeError in the loop.
    data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
    print("Experiment\tStatus\tControl\tStatus")
    for exp in data:
        if not exp.get("possible_controls") or exp["status"] == "released":
            continue
        control = encodedcc.get_ENCODE(exp["possible_controls"][0], connection)
        if control["status"] == "released":
            print("{}\t{}\t{}\t{}".format(exp["accession"], exp["status"], control["accession"], control["status"]))
    '''
예제 #13
0
    def run_script(self):
        """Log the status of every object under each accession.

        For each accession in ``self.ACCESSIONS`` the object and its audit
        are fetched, ``get_status`` fills ``self.statusDict``, and each
        entry is logged by status category.  Entries whose status is in
        neither list below are handed to ``releasinator`` when
        ``self.UPDATE`` is set and the audit passed (or ``self.FORCE``).
        """
        # set_up() reports the run mode and builds the list of accessions
        # to run from
        self.set_up()

        good = ["released", "current", "disabled", "published", "finished", "virtual"]
        bad = ["replaced", "revoked", "deleted", "upload failed", "archived",
               "format check failed", "uploading", "error"]
        ignore = ["User", "AntibodyCharacterization", "Publication", "ReferenceEpigenome"]
        for accession in self.ACCESSIONS:
            self.searched = []
            expandedDict = encodedcc.get_ENCODE(accession, self.connection)
            objectStatus = expandedDict.get("status")
            obj = expandedDict["@type"][0]

            # The audit is read from the "page" frame of the same object.
            audit = encodedcc.get_ENCODE(accession, self.connection, "page").get("audit", {})
            logger.info("{}: {} Status: {}".format(obj, accession, objectStatus))
            passAudit = True
            if audit.get("ERROR", ""):
                logger.warning("WARNING: Audit status: ERROR")
                passAudit = False
            if audit.get("NOT_COMPLIANT", ""):
                logger.warning("WARNING: Audit status: NOT COMPLIANT")
                passAudit = False

            self.statusDict = {}
            self.get_status(expandedDict)
            if self.FORCE:
                # FORCE overrides a failed audit.
                passAudit = True

            # Type names already logged as a heading for this accession.
            announced = []
            for key in sorted(self.statusDict.keys()):
                entry = self.statusDict[key]
                name = entry[0]
                status = entry[1]
                if name in ignore:
                    continue
                if name not in announced:
                    logger.info(name.upper())
                if status in good:
                    if self.LOGALL:
                        logger.info("{} has status {}".format(key, status))
                elif status in bad:
                    logger.warning("WARNING: {} has status {}".format(key, status))
                else:
                    logger.info("{} has status {}".format(key, status))
                    if self.UPDATE and passAudit:
                        self.releasinator(name, key, status)
                announced.append(name)
        print("Data written to file", self.outfile)
def main():
    """Print "<object id><TAB><field value>" for every requested object.

    The objects come from ``get_experiment_list`` (driven by ``--infile``
    and ``--search``).  Each non-empty entry is fetched and its ``@id`` is
    printed with the value of ``--field``, or an empty string when the
    object does not have that field.  Empty entries are echoed as-is.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i', default='obList', help="File containing a list of ENCSRs.")
    parser.add_argument('--search', default='NULL', help="The search parameters.")
    parser.add_argument('--key', default='default', help="The keypair identifier from the keyfile.  Default is --key=default")
    parser.add_argument('--keyfile', default=os.path.expanduser("~/keypairs.json"), help="The keypair file.  Default is --keyfile=%s" %(os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug', default=False, action='store_true', help="Print debug messages.  Default is False.")
    parser.add_argument('--field', default='accession', help="The field to report.  Default is accession.")
    args = parser.parse_args()

    # NOTE(review): this binds a function-local name only; if a module-level
    # DEBUG_ON flag was meant, a `global` declaration is missing. Confirm.
    DEBUG_ON = args.debug

    myKey = encodedcc.ENC_Key(args.keyfile, args.key)
    myConnect = encodedcc.ENC_Connection(myKey)

    # Get list of objects we are interested in
    objList = get_experiment_list(args.infile, args.search, myConnect)
    for identifier in objList:
        field = ''
        if identifier != '':
            ob = encodedcc.get_ENCODE(identifier, myConnect)
            # Fall back to the requested identifier when the response has
            # no '@id'; joining None below would raise TypeError.
            obj_id = ob.get('@id') or identifier
            if args.field in ob:
                field = str(ob[args.field])
        else:
            obj_id = identifier
        print('\t'.join([obj_id, field]))
예제 #15
0
def get_char_summary(lot, connection):
    """Count an object's characterizations by review outcome.

    Fetches ``lot`` in the embedded frame and sorts the ``status`` of each
    entry under ``characterizations`` into one of three counters: in
    progress, passing, or (for any other status) failing.

    Returns a dict with the keys ``number_chars_in_progress``,
    ``number_chars_passing`` and ``number_chars_failing``.
    """
    in_progress_states = ("in progress", "pending dcc review")
    passing_states = ('exempt from standards', "compliant")

    antibody = encodedcc.get_ENCODE(lot, connection, frame="embedded")
    summary = {
        "number_chars_in_progress": 0,
        "number_chars_passing": 0,
        "number_chars_failing": 0
    }
    for characterization in antibody.get("characterizations", []):
        state = characterization["status"]
        if state in in_progress_states:
            bucket = "number_chars_in_progress"
        elif state in passing_states:
            bucket = "number_chars_passing"
        else:
            # Every status not listed above counts as failing.
            bucket = "number_chars_failing"
        summary[bucket] += 1
    return summary
 def single_rep(self, obj):
     '''Pair every experiment fastq with each fastq of the first control.

     Fetches the first possible control in the embedded frame and returns
     early (with a message on stderr under DEBUG) when it has no files.
     With ``self.MISSING`` set, experiment files that already have
     ``controlled_by`` are left out.  Each pairing is appended to
     ``self.dataList`` and, with ``self.update`` set, sent to ``updater``.
     '''
     control_acc = obj["possible_controls"][0]["accession"]
     control_files = encodedcc.get_ENCODE(
         control_acc, self.connection, frame="embedded").get("files", [])
     if len(control_files) == 0:
         if self.DEBUG:
             print("Control object {} has no files".format(control_acc),
                   file=sys.stderr)
         return

     for control in control_files:
         if control.get("file_type", "") != "fastq":
             continue
         # Experiment fastqs eligible for pairing with this control file.
         eligible = [
             f["accession"] for f in obj["files"]
             if f.get("file_type", "") == "fastq" and
             (not self.MISSING or not f.get("controlled_by"))
         ]
         for exp_file in eligible:
             pairing = {
                 "ExpAcc": obj["accession"],
                 "Method": "Single",
                 "ExpFile": exp_file,
                 "ConFile": control["accession"]
             }
             self.dataList.append(pairing)
             if self.update:
                 self.updater(exp_file, control["accession"])
             if self.DEBUG:
                 print("ExpFile: {}, ConFile: {}".format(
                     pairing["ExpFile"], pairing["ConFile"]))
    def multi_control(self, obj):
        '''Pair experiment and control fastqs that share a biosample.

        Every possible control is fetched in the embedded frame and its
        fastq files are indexed by the biosample accession of the
        replicate they belong to.  If any control lacks replicates or
        files, nothing is paired.  Otherwise experiment fastqs are indexed
        the same way (skipping files that already have ``controlled_by``
        when ``self.MISSING`` is set) and matched on biosample accession.
        Each match is appended to ``self.dataList`` and, with
        ``self.update`` set, sent to ``updater``.
        '''
        control_by_biosample = {}
        all_controls_usable = True
        for con in obj["possible_controls"]:
            control = encodedcc.get_ENCODE(con["accession"],
                                           self.connection,
                                           frame="embedded")
            if not control.get("replicates"):
                if self.DEBUG:
                    print("No replicates found in control {}".format(
                        con["accession"]),
                          file=sys.stderr)
                all_controls_usable = False
                continue
            for rep in control["replicates"]:
                if not control.get("files"):
                    if self.DEBUG:
                        print("No files found for control {}".format(
                            con["accession"]),
                              file=sys.stderr)
                    all_controls_usable = False
                    continue
                biosample = rep["library"]["biosample"]["accession"]
                rep_number = rep["biological_replicate_number"]
                for f in control["files"]:
                    if f.get("file_type", "") != "fastq":
                        continue
                    if rep_number in f["biological_replicates"]:
                        # A later matching file replaces an earlier one.
                        control_by_biosample[biosample] = f["accession"]

        if not all_controls_usable:
            return

        exp_by_biosample = {}
        for rep in obj["replicates"]:
            biosample = rep["library"]["biosample"]["accession"]
            rep_number = rep["biological_replicate_number"]
            for f in obj["files"]:
                if f.get("file_type", "") != "fastq":
                    continue
                if self.MISSING and f.get("controlled_by"):
                    continue
                if rep_number in f["biological_replicates"]:
                    exp_by_biosample[biosample] = f["accession"]

        for biosample, exp_file in exp_by_biosample.items():
            con_file = control_by_biosample.get(biosample)
            if not con_file:
                continue
            pairing = {
                "ExpAcc": obj["accession"],
                "Method": "Biosample",
                "ExpFile": exp_file,
                "ConFile": con_file
            }
            self.dataList.append(pairing)
            if self.update:
                self.updater(exp_file, con_file)
            if self.DEBUG:
                print("Biosample: {}, ExpFile: {}, ConFile: {}".format(
                    biosample, pairing["ExpFile"], pairing["ConFile"]))
def get_antibody_approval(antibody, target):
    """Look up the approval status of an antibody for a given target.

    Runs an ``antibody_approval`` search for *antibody* using the
    ``connection`` name from the enclosing (module) scope and returns the
    ``status`` of the first result whose target name equals *target*.
    Returns ``"UNKNOWN"`` when no result matches.
    """
    server = connection
    hits = encodedcc.get_ENCODE(
        'search/?searchTerm=' + antibody + '&type=antibody_approval',
        server)['@graph']
    # Lazy generator: stops at the first matching approval, like a loop
    # with an early return would.
    statuses = (hit['status'] for hit in hits
                if hit['target']['name'] == target)
    return next(statuses, "UNKNOWN")
    def multi_rep(self, obj):
        """Pair fastqs for one control with a replicate per experiment replicate.

        Fetches the files of the first possible control, bails out (with an
        optional debug message on stderr) when the replicate counts differ
        or the control has no files, then hands the collected experiment
        and control fastq dictionaries to ``self.mini``. The argument order
        passed to ``self.mini`` depends on ``self.ignore_runtype``.
        """
        control = obj["possible_controls"][0]
        control_files = encodedcc.get_ENCODE(
            control["accession"], self.connection,
            frame="embedded").get("files", [])
        n_control_reps = len(control.get("replicates", []))
        n_exp_reps = len(obj["replicates"])
        if n_control_reps != n_exp_reps:
            if self.DEBUG:
                print("Control has {} replicates and experiment has {} replicates".format(
                    n_control_reps, n_exp_reps), file=sys.stderr)
            return
        if not control_files:
            if self.DEBUG:
                print("Control {} has no files".format(control["accession"]),
                      file=sys.stderr)
            return

        exp_data = {}
        con_data = {}
        for exp_file in obj["files"]:
            if exp_file.get("file_type", "") != "fastq":
                continue
            # In MISSING mode only files without a controlled_by are considered.
            if self.MISSING and exp_file.get("controlled_by"):
                continue
            self.pair_dict_maker(exp_data, exp_file)
        for con_file in control_files:
            if con_file.get("file_type", "") == "fastq":
                self.pair_dict_maker(con_data, con_file)

        if self.ignore_runtype:
            first, second = exp_data, con_data
        else:
            first, second = con_data, exp_data
        self.mini(first, second, obj)
 def single_rep(self, obj):
     """Record pairings for one single-replicate control.

     Handles one control with one replicate against an experiment with
     multiple replicates. For every fastq of the first possible control,
     collects the accessions of the experiment's fastqs (in MISSING mode
     only those without ``controlled_by``) and appends a "Single" record
     to ``self.data`` when at least one experiment file was found.
     """
     control_acc = obj["possible_controls"][0]["accession"]
     control_files = encodedcc.get_ENCODE(
         control_acc, self.connection, frame="embedded").get("files", [])
     if not control_files:
         if self.DEBUG:
             print("Control object {} has no files".format(control_acc),
                   file=sys.stderr)
         return
     for con_file in control_files:
         if con_file.get("file_type", "") != "fastq":
             continue
         exp_fastqs = [
             exp_file["accession"]
             for exp_file in obj["files"]
             if exp_file.get("file_type", "") == "fastq"
             and not (self.MISSING and exp_file.get("controlled_by"))
         ]
         record = {
             "Exp Accession": obj["accession"],
             "Check type": "Single",
             "Experiment": exp_fastqs,
             "Control": con_file["accession"]
         }
         if exp_fastqs:
             self.data.append(record)
         # Debug output is printed even when nothing was recorded.
         if self.DEBUG:
             print("experiment files {}".format(record["Experiment"]))
             print("control files {}".format(record["Control"]))
    def multi_rep(self, obj, ignore_runtype=False):
        """Match experiment and control fastqs replicate by replicate.

        Handles one control with one replicate per experiment replicate.
        Each fastq is keyed by "<first biological replicate>-<paired_end>";
        with *ignore_runtype* the paired-end part is replaced by ``None`` so
        only the replicate number has to agree. Matches are appended to
        ``self.data``:

        * ``ignore_runtype`` true: one record per experiment file listing
          all matching control files (recorded even when the list is empty).
        * otherwise: one record per control file listing the matching
          experiment files, recorded only when there is at least one.
        """
        control = obj["possible_controls"][0]
        control_files = encodedcc.get_ENCODE(
            control["accession"], self.connection,
            frame="embedded").get("files", [])
        n_control_reps = len(control.get("replicates", []))
        n_exp_reps = len(obj["replicates"])
        if n_control_reps != n_exp_reps:
            if self.DEBUG:
                print("Control has {} replicates and experiment has {} replicates".format(
                    n_control_reps, n_exp_reps), file=sys.stderr)
            return
        if not control_files:
            if self.DEBUG:
                print("Control {} has no files".format(control["accession"]),
                      file=sys.stderr)
            return

        def keyed(fastq):
            """Return (accession, "<replicate>-<paired_end>") for a fastq."""
            replicates = fastq.get("biological_replicates")
            paired_end = fastq.get("paired_end")
            accession = fastq["accession"]
            if ignore_runtype:
                paired_end = None
            return accession, str(replicates[0]) + "-" + str(paired_end)

        exp_data = {}
        for exp_file in obj["files"]:
            if exp_file.get("file_type", "") != "fastq":
                continue
            # In MISSING mode only files without a controlled_by are considered.
            if self.MISSING and exp_file.get("controlled_by"):
                continue
            accession, pair_key = keyed(exp_file)
            exp_data[accession] = pair_key

        con_data = {}
        for con_file in control_files:
            if con_file.get("file_type", "") == "fastq":
                accession, pair_key = keyed(con_file)
                con_data[accession] = pair_key

        if ignore_runtype:
            for exp_acc, exp_key in exp_data.items():
                matches = [con_acc for con_acc, con_key in con_data.items()
                           if exp_key == con_key]
                record = {"Exp Accession": obj["accession"],
                          "Check type": "Multi-runtype ignored",
                          "Experiment": exp_acc,
                          "Control": matches}
                self.data.append(record)
                if self.DEBUG:
                    print("experiment files", exp_acc)
                    print("control files", matches)
            return

        for con_acc, con_key in con_data.items():
            matches = [exp_acc for exp_acc, exp_key in exp_data.items()
                       if con_key == exp_key]
            record = {"Exp Accession": obj["accession"],
                      "Check type": "Multi",
                      "Experiment": matches,
                      "Control": con_acc}
            if matches:
                self.data.append(record)
            if self.DEBUG:
                print("experiment files", matches)
                print("control files", con_acc)
예제 #22
0
def get_experiment_list(path, search, connection):
    """Return experiment identifiers from a file or from an ENCODE search.

    When *search* is the sentinel string ``"NULL"`` the identifiers are the
    whitespace-stripped lines of the file at *path*; otherwise *search* is
    sent to ENCODE (embedded frame) and the ``accession`` of every
    ``@graph`` entry is returned.
    """
    if search != "NULL":
        response = encodedcc.get_ENCODE(search, connection, frame='embedded')
        return [hit['accession'] for hit in response['@graph']]
    with open(path) as handle:
        return [raw.strip() for raw in handle]
    def multi_control(self, obj):
        """Match experiment and control fastqs by biosample across controls.

        Every possible control is fetched (embedded frame) and its fastqs
        are indexed by the biosample accession of the replicate they belong
        to. If any control lacks replicates or files, nothing is recorded.
        Otherwise the experiment's fastqs are indexed the same way and a
        "Biosample" record is appended to ``self.data`` for each biosample
        present on both sides. Only the last matching file per biosample is
        kept on either side.
        """
        con_data = {}
        usable = True
        for con in obj["possible_controls"]:
            control = encodedcc.get_ENCODE(
                con["accession"], self.connection, frame="embedded")
            replicates = control.get("replicates")
            if not replicates:
                if self.DEBUG:
                    print(
                        "No replicates found in control {}".format(
                            con["accession"]),
                        file=sys.stderr)
                usable = False
                continue
            for rep in replicates:
                files = control.get("files")
                if not files:
                    if self.DEBUG:
                        print(
                            "No files found for control {}".format(
                                con["accession"]),
                            file=sys.stderr)
                    usable = False
                    continue
                biosample_acc = rep["library"]["biosample"]["accession"]
                rep_number = rep["biological_replicate_number"]
                for con_file in files:
                    if con_file.get("file_type", "") != "fastq":
                        continue
                    if rep_number in con_file["biological_replicates"]:
                        con_data[biosample_acc] = con_file["accession"]

        if not usable:
            return

        exp_data = {}
        for rep in obj["replicates"]:
            biosample_acc = rep["library"]["biosample"]["accession"]
            rep_number = rep["biological_replicate_number"]
            for exp_file in obj["files"]:
                if exp_file.get("file_type", "") != "fastq":
                    continue
                # In MISSING mode only files without a controlled_by count.
                if self.MISSING and exp_file.get("controlled_by"):
                    continue
                if rep_number in exp_file["biological_replicates"]:
                    exp_data[biosample_acc] = exp_file["accession"]

        for biosample_acc, exp_file_acc in exp_data.items():
            if not con_data.get(biosample_acc):
                continue
            record = {
                "Exp Accession": obj["accession"],
                "Check type": "Biosample",
                "Experiment": exp_file_acc,
                "Control": con_data[biosample_acc]
            }
            self.data.append(record)
            if self.DEBUG:
                print("Biosample {}: files {}".format(biosample_acc, record))
예제 #24
0
def main():
    """Report unreleased experiments whose first possible control is released.

    Runs the query given on the command line and prints a tab-separated
    table (experiment accession/status, control accession/status) for every
    result that has possible controls, is not itself ``released``, and
    whose first possible control is ``released``.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Default to an empty list: a response without "@graph" used to make
    # the loop below raise TypeError on None.
    data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
    print("Experiment\tStatus\tControl\tStatus")
    for exp in data:
        controls = exp.get("possible_controls")
        if not controls or exp["status"] == "released":
            continue
        # Only the first listed control is checked.
        control = encodedcc.get_ENCODE(controls[0], connection)
        if control["status"] == "released":
            print("{}\t{}\t{}\t{}".format(exp["accession"],
                                          exp["status"],
                                          control["accession"],
                                          control["status"]))
    '''
예제 #25
0
def get_antibody_approval(antibody, target):
    """Return the approval status of *antibody* for *target*.

    Searches ENCODE for ``antibody_approval`` objects matching *antibody*
    via the ``connection`` name from the enclosing (module) scope. The
    status of the first hit whose target name equals *target* is returned;
    ``"UNKNOWN"`` is returned when there is no such hit.
    """
    conn = connection
    query = ''.join(
        ('search/?searchTerm=', antibody, '&type=antibody_approval'))
    results = encodedcc.get_ENCODE(query, conn)
    status = "UNKNOWN"
    for record in results['@graph']:
        if record['target']['name'] == target:
            status = record['status']
            break
    return status
예제 #26
0
def get_antibody_approval(antibody, target, connection):
    """Return the approval status of *antibody* for *target*.

    Searches ENCODE (embedded frame) for ``antibody_approval`` objects
    matching *antibody* over *connection*. The status of the first hit
    whose target name equals *target* is returned, or ``"UNKNOWN"`` when
    none matches.
    """
    query = 'search/?searchTerm=' + antibody + '&type=antibody_approval'
    response = encodedcc.get_ENCODE(query, connection, frame='embedded')
    for hit in response['@graph']:
        if hit['target']['name'] != target:
            continue
        return hit['status']
    return "UNKNOWN"
예제 #27
0
    def __init__(self, args, connection):
        """Store the run options and index the ENCODE schema profiles.

        Fetches ``/profiles/`` once and derives from it:

        * ``dontExpand`` / ``profiles_ref``: ``self.helper(name)`` for the
          profiles that are not expanded vs. the ones that are processed.
        * ``PROFILES``: per processed profile, the ``keysLink`` list filled
          in by ``self.make_profile``.
        * ``date_released``: processed profiles with a ``date_released``
          property.
        * ``current`` / ``finished``: profiles whose status enum contains
          the respective value.
        """
        # Copy the command-line options onto the instance so they do not
        # have to be passed around.
        self.infile = args.infile
        self.outfile = args.outfile
        self.QUERY = args.query
        self.LOGALL = args.logall
        self.FORCE = args.force
        self.UPDATE = args.update
        self.keysLink = []
        self.PROFILES = {}
        self.ACCESSIONS = []
        self.statusDict = {}
        self.connection = connection

        schemas = encodedcc.get_ENCODE("/profiles/", self.connection)
        skipped = ["Lab", "Award", "AntibodyCharacterization", "Platform",
                   "Publication", "Organism", "Reference", "AccessKey",
                   "User", "Target"]
        # Profile names containing one of these are never expanded; they
        # usually link to other experiments/objects.
        status_only = ("AnalysisStep", "QualityMetric", "Donor")
        self.profilesJSON = []
        self.dontExpand = []
        self.date_released = []
        for name in schemas.keys():
            if any(tag in name for tag in status_only):
                self.dontExpand.append(self.helper(name))
            elif name not in skipped:
                self.profilesJSON.append(name)

        self.profiles_ref = []
        for name in self.profilesJSON:
            self.profiles_ref.append(self.helper(name))

        for name in self.profilesJSON:
            schema = schemas[name]
            # make_profile fills self.keysLink: a key in this list points
            # to a link and will be embedded in the final product.
            self.keysLink = []
            self.make_profile(schema)
            self.PROFILES[name] = self.keysLink
            # Remember which profiles actually get a date released.
            if "date_released" in schema["properties"]:
                self.date_released.append(name)

        self.current = []
        self.finished = []
        for name, schema in schemas.items():
            allowed = schema["properties"]["status"]["enum"]
            if "current" in allowed:
                self.current.append(name)
            if "finished" in allowed:
                self.finished.append(name)
예제 #28
0
    def set_up(self):
        """Announce the run mode and populate ``self.ACCESSIONS``.

        Identifiers come from ``self.infile`` (a file with one identifier
        per line, or else a comma-separated string) or, failing that, from
        the results of ``self.QUERY``. For query results the first
        available of accession, uuid, @id and first alias is used.

        Exits the process with status 1 when no identifier was collected.
        """
        if self.UPDATE:
            print("WARNING: This run is an " +
                  "UPDATE run objects will be released.")
        else:
            print("Object status will be checked but not changed")
        if self.FORCE:
            print("WARNING: Objects that do not " +
                  "pass audit will be FORCE-released")
        if self.HELA:
            print(
                'WARNING: Objects associated with HeLa data will be released')
        if self.LOGALL:
            print("Logging all statuses")

        if self.infile:
            if os.path.isfile(self.infile):
                # Context manager so the handle is closed deterministically
                # (the bare open() in a comprehension leaked it).
                with open(self.infile) as handle:
                    self.ACCESSIONS = [line.rstrip('\n') for line in handle]
            else:
                self.ACCESSIONS = self.infile.split(",")
        elif self.QUERY:
            if "search" in self.QUERY:
                temp = encodedcc.get_ENCODE(self.QUERY,
                                            self.connection).get("@graph", [])
            else:
                temp = [encodedcc.get_ENCODE(self.QUERY, self.connection)]
            if any(temp):
                for obj in temp:
                    # Preference order: accession, uuid, @id, first alias.
                    for field in ("accession", "uuid", "@id"):
                        if obj.get(field):
                            self.ACCESSIONS.append(obj[field])
                            break
                    else:
                        if obj.get("aliases"):
                            self.ACCESSIONS.append(obj["aliases"][0])

        if not self.ACCESSIONS:
            # if something happens and we end up with no accessions stop
            print("ERROR: object has no identifier", file=sys.stderr)
            sys.exit(1)
 def updater(self, exp, con):
     """Add *con* to the ``controlled_by`` list of experiment file *exp*.

     Fetches the file's current ``controlled_by`` values; when *con* is not
     among them, PATCHes the file with the extended list. Otherwise only an
     error message is printed and nothing is changed.
     """
     existing = encodedcc.get_ENCODE(exp, self.connection).get("controlled_by", [])
     if con in existing:
         print("ERROR: controlled_by for experiment file {} already contains {}".format(exp, con))
         return
     payload = {"controlled_by": existing + [con]}
     print("patching experiment file {} with controlled_by {}".format(exp, con))
     encodedcc.patch_ENCODE(exp, self.connection, payload)
예제 #30
0
def main():
    """Issue a GET for every selected object.

    Objects are selected, in order of precedence, by a search query (the
    ``@id`` of each result), a file with one identifier per line, or a
    single accession. The responses are discarded here; any effect of the
    GET beyond the request itself is not visible in this function.

    Exits with status 1 when no selection option was given.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.query:
        graph = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        accessions = [obj.get("@id") for obj in graph]
    elif args.infile:
        # Context manager so the file handle is closed (it used to leak).
        with open(args.infile) as handle:
            accessions = [line.strip() for line in handle]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        encodedcc.get_ENCODE(acc, connection)
def main():
    """Print a tab-separated table of selected properties for each object.

    Objects are selected, in order of precedence, by a search query, a
    file with one accession per line, or a single accession. Each object is
    fetched and the properties named in ``headers`` are written to stdout
    as TSV; missing or empty properties become empty cells.

    Exits with status 1 when no selection option was given.
    """
    headers = [
        "accession", "description", "organism", "age_display", "life_stage",
        "sex", "biosample_term_name", "biosample_type",
        "depleted_in_term_name", "phase", "subcellular_fraction_term_name",
        "post_synchronization_time", "post_synchronization_time_units",
        "synchronization", "model_organism_mating_status", "treatments",
        "donor", "transfection_type", "talens", "constructs",
        "model_organism_donor_constructs", "rnais", "part_of", "pooled_from",
        "derived_from", "status", "culture_harvest_date", "culture_start_date",
        "date_obtained", "lab", "source", "note", "notes", "health_status",
        "starting_amount", "starting_amount_units"
    ]

    def cell(value):
        """Return *value* for output, or "" when it is missing or empty."""
        if value is None:
            return ""
        try:
            return value if any(value) else ""
        except TypeError:
            # Scalars (numbers, booleans) are not iterable; calling any()
            # on them used to abort the whole run.
            return value

    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.query:
        graph = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        accessions = [obj.get("accession") for obj in graph]
    elif args.infile:
        # Context manager so the file handle is closed (it used to leak).
        with open(args.infile) as handle:
            accessions = [line.strip() for line in handle]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)

    data = []
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection)
        data.append({h: cell(obj.get(h, "")) for h in headers})

    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    writer.writeheader()
    writer.writerows(data)
 def find_ENCODE_extras(self, communityList, consortiumList, connection):
     """Find publications in the ENCODE database that are not in the files provided.

     Published publications are fetched in two groups (published by the
     community vs. everything else) and their identifiers are sorted into
     PMIDs and "other" entries.

     Args:
         communityList: Known community identifiers; compared against the
             "PMID:..." strings found on ENCODE (presumably the same
             format; confirm against the caller).
         consortiumList: Same, for consortium publications.
         connection: Connection passed through to ``encodedcc.get_ENCODE``.

     Returns:
         A 4-tuple ``(community_ENCODE_Only, communityOtherID,
         consortium_ENCODE_Only, consortiumOtherID)``. The ``*_ENCODE_Only``
         lists hold PMIDs present on ENCODE but not in the given list (in
         arbitrary order); the ``*OtherID`` lists hold the ``@id`` of the
         publication, once per identifier that is neither a PMID nor a
         PMCID.
     """
     # Build the URLs from adjacent pieces. A backslash continuation inside
     # the string literal would embed the next line's indentation (a run of
     # spaces) in the query string.
     base_url = "/search/?type=publication&status=published"
     url_tail = "&field=identifiers&limit=all"
     community_url = base_url + "&published_by=community" + url_tail
     consortium_url = base_url + "&published_by!=community" + url_tail

     def split_identifiers(publications):
         """Return (PMID strings, @ids of publications with other ids)."""
         pmids = []
         other_ids = []
         for pub in publications:
             for id_num in pub.get("identifiers", []):
                 if "PMID:" in id_num:
                     # this is something that has a pubmed ID
                     pmids.append(id_num)
                 elif "PMCID:PMC" in id_num:
                     # this is an alternate PMID
                     continue
                 else:
                     # no PMID yet: find it and PATCH it in
                     other_ids.append(pub.get("@id"))
         return pmids, other_ids

     # Default to [] so a response without "@graph" yields empty results
     # instead of a TypeError when iterating None.
     communityResult = encodedcc.get_ENCODE(community_url, connection).get("@graph", [])
     consortiumResult = encodedcc.get_ENCODE(consortium_url, connection).get("@graph", [])

     communityPMIDfromENCODE, communityOtherID = split_identifiers(communityResult)
     consortiumPMIDfromENCODE, consortiumOtherID = split_identifiers(consortiumResult)

     community_ENCODE_Only = list(set(communityPMIDfromENCODE) - set(communityList))
     consortium_ENCODE_Only = list(set(consortiumPMIDfromENCODE) - set(consortiumList))
     return community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID
def main():
    """Write selected properties of each requested object to stdout as TSV.

    The objects come from a search query, a file with one accession per
    line, or a single accession (checked in that order). One row per
    object is written with the columns listed in ``headers``; properties
    that are absent or empty are written as empty cells.

    Exits with status 1 when no selection option was given.
    """
    headers = ["accession", "description", "organism", "age_display",
               "life_stage", "sex", "biosample_term_name", "biosample_type",
               "depleted_in_term_name", "phase",
               "subcellular_fraction_term_name", "post_synchronization_time",
               "post_synchronization_time_units", "synchronization",
               "model_organism_mating_status", "treatments", "donor",
               "transfection_type", "talens", "constructs",
               "model_organism_donor_constructs", "rnais", "part_of",
               "pooled_from", "derived_from", "status", "culture_harvest_date",
               "culture_start_date", "date_obtained", "lab", "source", "note",
               "notes", "health_status", "starting_amount",
               "starting_amount_units"]

    def printable(value):
        """Map missing/empty values to "" and pass everything else through."""
        if value is None:
            return ""
        try:
            non_empty = any(value)
        except TypeError:
            # Numbers and other non-iterables made any() raise and aborted
            # the run; they are valid cell values.
            return value
        return value if non_empty else ""

    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        for hit in encodedcc.get_ENCODE(args.query, connection).get("@graph", []):
            accessions.append(hit.get("accession"))
    elif args.infile:
        # "with" closes the handle; the bare open() leaked it.
        with open(args.infile) as infile:
            accessions = [line.strip() for line in infile]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)

    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    rows = []
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection)
        rows.append({h: printable(obj.get(h, "")) for h in headers})
    # All objects are fetched before anything is written, as before.
    writer.writeheader()
    writer.writerows(rows)
def file_manager(key, value, connection, obj_type):
    """Download a protocol file and attach it to an ENCODE object as a Document.

    Any existing document carrying the same alias is first marked deleted
    and stripped of its aliases. A new released Document is then posted and
    *value* is patched to reference it. The downloaded file is removed
    afterwards, also when one of the ENCODE steps fails.

    Args:
        key: URL of the file. Its last path component is the local
            filename; its last two components form the document alias.
        value: Identifier of the object that receives the document.
        connection: Connection passed through to the ``encodedcc`` calls.
        obj_type: ``"Biosample"`` patches ``protocol_documents`` and takes
            the description part from before the first "-" of the
            filename; any other value patches ``documents`` and uses the
            part after the first "-".

    Raises:
        requests.HTTPError: If the download returns an HTTP error status.
    """
    import os  # local import: only needed for the cleanup step

    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    # Stop here on a failed download so an HTTP error page is never
    # uploaded as a released protocol document.
    r.raise_for_status()
    with open(filename, "wb") as outfile:
        outfile.write(r.content)

    try:
        if obj_type == "Biosample":
            filepart = filename.split("-")[0]
        else:
            filepart = filename.split("-")[1]

        attach = attachment(filename)
        temp = "_".join(key.split("/")[-2:])
        aliases = ["brenton-graveley:" + temp]

        existing = encodedcc.get_ENCODE(quote(aliases[0]), connection)
        if existing['status'] != 'error':
            # Retire the old document and free its alias for the new one.
            removing_patch = {'status': 'deleted', 'aliases': []}
            print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
            encodedcc.patch_ENCODE(quote(aliases[0]), connection,
                                   removing_patch)

        upload = {
            "aliases": aliases,
            "attachment": attach,
            "award": "U54HG007005",
            "document_type": "general protocol",
            "lab": "/labs/brenton-graveley/",
            "status": "released",
            "description":
            "{obj_type} protocol for {filepart} shRNA followed by RNA-seq".format(
                obj_type=obj_type, filepart=filepart),
        }

        print("Uploading {} as {}".format(filename, aliases[0]))
        encodedcc.new_ENCODE(connection, "Document", upload)

        print("Patching {} with document {}".format(value, aliases[0]))
        if obj_type == "Biosample":
            docs = {"protocol_documents": aliases}
        else:
            docs = {"documents": aliases}
        encodedcc.patch_ENCODE(quote(value), connection, docs)
    finally:
        # os.remove instead of spawning "rm": portable, and a failed
        # removal raises instead of passing silently.
        print("Removing document {}".format(filename))
        os.remove(filename)
    '''
예제 #35
0
 def has_audit(self, accession):
     """Return True when the object carries an ERROR or NOT_COMPLIANT audit.

     Issues another GET for *accession*, passing ``'page'`` as the third
     argument to ``get_ENCODE``, and inspects the ``audit`` section of the
     response. When a blocking audit is present a warning listing the first
     category of every audit level is printed and logged.
     """
     page = encodedcc.get_ENCODE(accession, self.connection, 'page')
     audit = page.get('audit', {})
     blocking = any(audit.get(level) is not None
                    for level in ('ERROR', 'NOT_COMPLIANT'))
     if not blocking:
         return False
     # One category per audit level: the first entry of each.
     details = [entries[0]['category'] for entries in audit.values()]
     message = 'WARNING: AUDIT on object: {}. SKIPPING!'.format(details)
     print(message)
     logger.warning(message)
     return True
 def get_status(self, obj):
     """Record the status of *obj* and of the objects it links to.

     Adds ``{@id: [@type, status]}`` to ``self.statusDict`` for *obj* when
     its first ``@type`` has an entry in ``self.PROFILES``. Every property
     listed for that type is treated as a link (or list of links):

     * links whose collection is in ``self.profiles_ref`` are fetched and
       processed recursively;
     * links whose collection is in ``self.dontExpand`` are fetched and
       only their status is recorded.

     ``self.searched`` tracks the ``@id`` values already visited so no link
     is followed twice.
     """
     name = obj["@type"][0]
     self.searched.append(obj["@id"])
     if not self.PROFILES.get(name):
         return
     self.statusDict[obj["@id"]] = [name, obj["status"]]
     for key in obj.keys():
         # Only properties listed in the profile are links.
         if key not in self.PROFILES[name]:
             continue
         value = obj[key]
         links = value if isinstance(value, list) else [value]
         for link in links:
             # "/some-collection/<id>/" -> "somecollection"
             item = link.split("/")[1].replace("-", "")
             if link in self.searched:
                 continue
             if item in self.profiles_ref:
                 # expand subobject
                 subobj = encodedcc.get_ENCODE(link, self.connection)
                 self.get_status(subobj)
             elif item in self.dontExpand:
                 # not a link we expand, but one we get the status of
                 tempobj = encodedcc.get_ENCODE(link, self.connection)
                 self.searched.append(tempobj["@id"])
                 self.statusDict[tempobj["@id"]] = [tempobj["@type"][0],
                                                    tempobj["status"]]
예제 #37
0
def get_experiment_list(file, search, connection):
    """Return experiment identifiers from a file or from an ENCODE search.

    Args:
        file: Path of a text file with one identifier per line; only read
            when *search* is ``"NULL"``.
        search: Search URL, or the sentinel string ``"NULL"`` to read
            *file* instead. ``&limit=all`` is appended to the search.
        connection: Connection passed through to ``encodedcc.get_ENCODE``.

    Returns:
        A list of whitespace-stripped lines (blank lines are kept as empty
        strings), or the ``accession`` of every search result.
    """
    if search == "NULL":
        # Context manager closes the handle, which used to be left open.
        with open(file) as handle:
            return [line.strip() for line in handle]
    results = encodedcc.get_ENCODE(search + '&limit=all', connection,
                                   frame='embedded')
    return [hit['accession'] for hit in results['@graph']]
예제 #38
0
def get_experiment_list(file, search, connection):
    """Collect the identifiers to process, from a file or an ENCODE search.

    With *search* equal to the sentinel ``"NULL"`` the stripped lines of
    *file* are returned (blank lines become empty strings). Otherwise
    *search*, with ``&limit=all`` appended, is sent to ENCODE in the
    embedded frame and the ``accession`` of each ``@graph`` entry is
    returned.
    """
    if search != "NULL":
        # Named "response" rather than "set" to avoid shadowing the builtin.
        response = encodedcc.get_ENCODE(search + '&limit=all', connection,
                                        frame='embedded')
        return [entry['accession'] for entry in response['@graph']]
    # "with" guarantees the file is closed; it was previously never closed.
    with open(file) as listing:
        return [line.strip() for line in listing]
예제 #39
0
def main():
    """Print "<@id>\\t<field value>" for every requested object.

    The objects come from ``get_experiment_list`` (a file of identifiers or
    a search). For each non-blank identifier the object is fetched and the
    value of ``--field`` is printed next to its ``@id``; the value is empty
    when the object lacks that field. Blank identifiers produce a line with
    two empty columns.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile',
                        '-i',
                        default='obList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument(
        '--key',
        default='default',
        help=
        "The keypair identifier from the keyfile.  Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file.  Default is --keyfile=%s" %
                        (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages.  Default is False.")
    parser.add_argument('--field',
                        default='accession',
                        help="The field to report.  Default is accession.")
    args = parser.parse_args()

    myKey = encodedcc.ENC_Key(args.keyfile, args.key)
    myConnect = encodedcc.ENC_Connection(myKey)

    # Get list of objects we are interested in
    for entry in get_experiment_list(args.infile, args.search, myConnect):
        field = ''
        identifier = entry
        if entry != '':
            ob = encodedcc.get_ENCODE(entry, myConnect)
            # Fall back to the requested identifier when the response has
            # no "@id"; joining None used to raise TypeError.
            identifier = ob.get('@id') or entry
            if args.field in ob:
                field = str(ob[args.field])
        print('\t'.join([identifier, field]))
예제 #40
0
def main():
    """Print the read length (and optionally the header) of fastq reads.

    File accessions come from ``--infile`` (a file with one accession per
    line, or else a comma-separated string) or from the fastq files of the
    objects returned by ``--query``. Every read of every file is streamed
    through ``encodedcc.fastq_read`` and one tab-separated line is printed
    per read.

    Exits with status 1 when neither option was given.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            # Context manager so the file handle is closed (it used to leak).
            with open(args.infile) as handle:
                accessions = [line.strip() for line in handle]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            data = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            data = [encodedcc.get_ENCODE(args.query, connection)]
        for exp in data:
            for f in exp.get("files", []):
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    else:
        print("No accessions to check")
        sys.exit(1)

    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            if args.header:
                header = header.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)) + "\t" + header)
            else:
                sequence = sequence.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)))
 def updater(self, exp, con):
     """Run the update step for one experiment/control file pair.

     Reads the ``controlled_by`` list of experiment file *exp*. If *con* is
     missing from it, the file is PATCHed with the list extended by *con*;
     if it is already there, an error message is printed instead.
     """
     current = encodedcc.get_ENCODE(exp, self.connection).get(
         "controlled_by", [])
     already_linked = con in current
     if not already_linked:
         patch_dict = {"controlled_by": current + [con]}
         message = "patching experiment file {} with controlled_by {}"
         print(message.format(exp, con))
         encodedcc.patch_ENCODE(exp, self.connection, patch_dict)
     else:
         message = "ERROR: controlled_by for experiment file {} already contains {}"
         print(message.format(exp, con))
def file_manager(key, value, connection, obj_type):
    """Download a protocol file, post it as a Document and attach it to an object.

    Args:
        key: URL of the file to download; its last path component is used as
            the local filename and its last two components form the alias.
        value: identifier of the object that receives the document.
        connection: connection object handed to the ``encodedcc`` helpers.
        obj_type: ``"Biosample"`` patches ``protocol_documents``; any other
            value patches ``documents``.

    The downloaded file is written to the current working directory and
    removed again once the object has been patched.
    """
    # Imported locally so the block does not depend on a file-level import.
    import os

    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    with open(filename, "wb") as outfile:
        outfile.write(r.content)
    # The part of the filename used in the description sits at a different
    # dash-separated position for biosamples than for other object types.
    if obj_type == "Biosample":
        filepart = filename.split("-")[0]
    else:
        filepart = filename.split("-")[1]

    attach = attachment(filename)
    temp = "_".join(key.split("/")[-2:])
    aliases = ["brenton-graveley:" + temp]

    # If a document with this alias already exists, mark it deleted and free
    # the alias so the new upload can take it over.
    if (encodedcc.get_ENCODE(quote(aliases[0]), connection)['status']) != 'error':
        removing_patch = {'status': 'deleted',
                          'aliases': []}
        print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
        encodedcc.patch_ENCODE(quote(aliases[0]), connection, removing_patch)

    upload = {"aliases": aliases,
              "attachment": attach,
              "award": "U54HG007005",
              "document_type": "general protocol",
              "lab": "/labs/brenton-graveley/",
              "status": "released",
              "description": "{obj_type} protocol for {filepart} shRNA followed by RNA-seq".format(obj_type=obj_type, filepart=filepart),
              }

    print("Uploading {} as {}".format(filename, aliases[0]))

    encodedcc.new_ENCODE(connection, "Document", upload)

    print("Patching {} with document {}".format(value, aliases[0]))
    if obj_type == "Biosample":
        docs = {"protocol_documents": aliases}
    else:
        docs = {"documents": aliases}

    encodedcc.patch_ENCODE(quote(value), connection, docs)

    print("Removing document {}".format(filename))
    # os.remove instead of spawning `rm`: portable, and a filename starting
    # with "-" cannot be misread as a command-line option.
    os.remove(filename)
    
    '''
def get_experiment_list(file, search, connection):

        objList = []
        if search is None:
            f = open(file)
            objList = f.readlines()
            for i in range(0, len(objList)):
                objList[i] = objList[i].strip()
        else:
            col = get_ENCODE(search, connection, frame='page')
            for i in range(0, len(col['@graph'])):
                # print set['@graph'][i]['accession']
                objList.append(col['@graph'][i]['@id'])
                # objList.append(set['@graph'][i]['uuid'] )

        return objList
def get_experiment_list(file, search, connection):
    """Return the identifiers of the objects to process.

    Args:
        file: path of a text file with one identifier per line; only read
            when ``search`` is None.
        search: query passed to ``get_ENCODE`` (``frame='page'``), or None.
        connection: connection object handed to ``get_ENCODE``.

    Returns:
        When ``search`` is None, the whitespace-stripped lines of ``file``;
        otherwise the ``@id`` of every entry in the result's ``@graph``.
    """
    if search is None:
        # Context manager so the handle is closed even if reading fails.
        with open(file) as handle:
            return [line.strip() for line in handle]

    col = get_ENCODE(search, connection, frame='page')
    return [entry['@id'] for entry in col['@graph']]
예제 #45
0
def retreive_list_of_replaced(object_to_inspect_acc, connection):
    """Collect an accession together with the accessions it replaced.

    The accession is looked up through an ``Item`` search. Only the first
    search hit is examined: each of its ``alternate_accessions`` is followed
    recursively and the results are appended after the starting accession.
    """
    collected = [object_to_inspect_acc]
    hits = encodedcc.get_ENCODE(
        'search/?type=Item&accession=' + object_to_inspect_acc,
        connection)['@graph']
    if not hits:
        return collected
    for hit in hits:
        for alternate in hit.get('alternate_accessions') or ():
            collected.extend(retreive_list_of_replaced(alternate, connection))
        # Returning inside the loop: later hits are never inspected.
        return collected
def make_matrix(rows, columns, headers, queries, basic_query, connection):
    """Print a tab-separated matrix of hyperlinked search counts.

    For every row a line is printed holding the row name followed by one
    spreadsheet ``=HYPERLINK(...)`` formula per header. Each formula links to
    ``connection.server`` plus the combined query and shows the ``total``
    reported for that query. Two lines containing a single space follow.
    """
    table = {}
    for row_name in rows:
        cells = table[row_name] = [row_name]
        for header in headers:
            search = basic_query + queries[row_name] + columns[header]
            result = get_ENCODE(search, connection, frame='object')
            url = connection.server + search
            count = result['total']
            cells.append('=HYPERLINK(' + '"' + url + '",' + repr(count) + ')')
        print('\t'.join(cells))
    for _ in range(2):
        print(' ')
def make_matrix(rows, columns, headers, queries, basic_query, connection):
    """Print a tab-separated matrix of hyperlinked search counts.

    Each printed line starts with the row name and continues with one
    spreadsheet ``=HYPERLINK(...)`` formula per header; the formula points at
    ``connection.server`` plus the combined query and displays the ``total``
    of that query's result. The matrix is followed by two lines holding a
    single space each.
    """
    matrix = {}
    for row in rows:
        line = [row]
        matrix[row] = line
        for col in headers:
            query = basic_query + queries[row] + columns[col]
            response = get_ENCODE(query, connection, frame='object')
            target = connection.server + query
            hits = response['total']
            line.append('=HYPERLINK(' + '"' + target + '",' + repr(hits) + ')')
        print('\t'.join(line))
    print(' ')
    print(' ')
def main():
    """Run ``excel_reader`` on every supported sheet of the input workbook.

    The sheets are either the single ``--type`` given on the command line or
    all sheet names of the workbook. A sheet is processed only when its
    lower-cased name is one of the lower-cased keys of ``/profiles/``;
    other sheet names are reported on stderr.
    """
    args = getArgs()
    connection = encodedcc.ENC_Connection(
        encodedcc.ENC_Key(args.keyfile, args.key))
    if args.type:
        names = [args.type]
    else:
        names = xlrd.open_workbook(args.infile).sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = [name.lower() for name in list(profiles.keys())]
    for sheet in names:
        if sheet.lower() not in supported_collections:
            print("Sheet name '{}' not part of supported object types!".format(sheet), file=sys.stderr)
            continue
        excel_reader(args.infile, sheet, args.update, connection, args.patchall)
def replacer(file, connection, update):
    """Mark a file whose first alias ends in ``_replaced`` as replaced.

    Args:
        file: file object (dict) with ``@id`` and, for old files, ``aliases``,
            ``accession`` and ``date_created``.
        connection: connection object handed to the ``encodedcc`` helpers.
        update: when true the patch is sent; otherwise nothing is changed.

    Files without aliases are reported on stdout. Files whose first alias
    does not end in ``_replaced`` are ignored.
    """
    suffix = "_replaced"
    if not file.get("aliases"):
        print("file {} has no aliases".format(file["@id"]))
        return
    first_alias = file["aliases"][0]
    if not first_alias.endswith(suffix):
        return

    # This is one of the old files. Slice the suffix off: str.rstrip() treats
    # its argument as a set of characters and would also eat trailing letters
    # of the real alias (e.g. "lab:filed_replaced" -> "lab:fi").
    alias = first_alias[:-len(suffix)]
    old_acc = file["accession"]
    old_date = file["date_created"]
    print(old_acc)
    new = encodedcc.get_ENCODE(quote(alias), connection)
    # These lookups raise KeyError when the fetched object lacks the fields,
    # which stops the patch below from being sent.
    new_acc = new["accession"]
    new_date = new["date_created"]
    patch_dict = {"status": "replaced", "alternate_accessions": [alias]}
    if update:
        encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
예제 #50
0
 def process_link(self, identifier_link, approved_types):
     """Fetch a linked object and expand it when it is eligible.

     The object is passed to ``self.get_status`` only when the collection
     part of the link is in ``self.profiles_ref``, the link is not in
     ``self.searched``, its first ``@type`` is in ``approved_types`` and it
     is not a restricted file. Restricted files are reported on stdout.
     """
     collection = identifier_link.split("/")[1].replace("-", "")
     linked = encodedcc.get_ENCODE(identifier_link, self.connection)
     linked_type = linked["@type"][0]
     restricted = (linked_type == 'File'
                   and self.is_restricted(linked) is True)
     if restricted:
         print (linked['@id'] + ' is restricted, ' +
                'therefore will not be released')
     if collection not in self.profiles_ref:
         return
     if identifier_link in self.searched:
         return
     if linked_type not in approved_types or restricted:
         return
     # expand subobject
     lower_levels = hi.dictionary_of_lower_levels.get(
         hi.levels_mapping.get(linked_type))
     self.get_status(linked, lower_levels)
예제 #51
0
 def process_link(self, identifier_link, approved_types):
     """Look up a linked object and hand it to ``get_status`` if allowed.

     Expansion happens only for links whose collection name is known in
     ``self.profiles_ref``, that are not yet in ``self.searched``, whose
     first ``@type`` is approved, and that are not restricted files. A
     restricted file is announced on stdout and skipped.
     """
     item = identifier_link.split("/")[1].replace("-", "")
     target = encodedcc.get_ENCODE(identifier_link, self.connection)
     target_type = target["@type"][0]
     is_restricted_file = False
     if target_type == 'File':
         if self.is_restricted(target) is True:
             print(target['@id'] + ' is restricted, ' +
                   'therefore will not be released')
             is_restricted_file = True
     known_and_new = (item in self.profiles_ref
                      and identifier_link not in self.searched)
     if not known_and_new:
         return
     if target_type in approved_types and not is_restricted_file:
         # expand subobject
         self.get_status(
             target,
             hi.dictionary_of_lower_levels.get(
                 hi.levels_mapping.get(target_type)))
def replacer(file, connection, update):
    """Mark a file whose first alias ends in ``_replaced`` as replaced.

    Args:
        file: file object (dict) with ``@id`` and, for old files, ``aliases``,
            ``accession`` and ``date_created``.
        connection: connection object handed to the ``encodedcc`` helpers.
        update: when true the patch is sent; otherwise nothing is changed.

    A file without aliases is reported on stdout; a file whose first alias
    lacks the ``_replaced`` suffix is left alone.
    """
    marker = "_replaced"
    if file.get("aliases"):
        # this has aliases
        if file["aliases"][0].endswith(marker):
            # This is one of the old ones. Remove exactly the suffix:
            # str.rstrip() strips a character set, so it would also remove
            # trailing letters of the alias itself
            # (e.g. "lab:filed_replaced" -> "lab:fi").
            alias = file["aliases"][0][:-len(marker)]
            old_acc = file["accession"]
            old_date = file["date_created"]
            print(old_acc)
            new = encodedcc.get_ENCODE(quote(alias), connection)
            # A KeyError from these lookups prevents the patch below when
            # the fetched object lacks the fields.
            new_acc = new["accession"]
            new_date = new["date_created"]
            patch_dict = {
                "status": "replaced",
                "alternate_accessions": [alias]
            }
            if update:
                encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
    else:
        print("file {} has no aliases".format(file["@id"]))
예제 #53
0
def main():
    """Run ``excel_reader`` on every supported sheet of the input workbook.

    Exits with status 1 when the input file does not exist. Sheets come from
    ``--type`` or from the workbook itself; a sheet is processed only when
    its lower-cased name matches a lower-cased key of ``/profiles/``, and is
    otherwise reported on stderr.
    """
    args = getArgs()
    connection = encodedcc.ENC_Connection(
        encodedcc.ENC_Key(args.keyfile, args.key))
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    if args.type:
        names = [args.type]
    else:
        names = xlrd.open_workbook(args.infile).sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = [s.lower() for s in list(profiles.keys())]
    for sheet in names:
        if sheet.lower() not in supported_collections:
            print("Sheet name '{name}' not part of supported object types!".format(name=sheet), file=sys.stderr)
            continue
        excel_reader(args.infile, sheet, args.update, connection, args.patchall)
예제 #54
0
 def _get_associated_term_id(self, data_type, data):
     """
     Find biosample_term_id associated with particular object.
     """
     obj_id = None
     if data_type == 'File':
         # Get biosample_term_id in file.dataset.
         obj_id = data.get('dataset')
     elif data_type == 'Replicate':
         # Get biosample_term_id in replicate.experiment.
         obj_id = data.get('experiment')
     elif data_type == 'Library':
         # Get biosample_term_id in library.biosample.
         obj_id = data.get('biosample')
     else:
         # For experiments and biosamples.
         biosample_term_id = data.get('biosample_term_id')
     if obj_id is not None:
         # Return biosample_term_id of embedded object.
         biosample_term_id = encodedcc.get_ENCODE(
             obj_id, self.connection).get('biosample_term_id')
     return biosample_term_id
    def multi_rep(self, obj):
        '''Pair experiment fastqs with fastqs of the single control.

        Handles one control that has one replicate per experiment replicate.
        Nothing is done when the replicate counts differ or the control has
        no files; with DEBUG set the reason is written to stderr.
        '''
        control = obj["possible_controls"][0]
        control_files = encodedcc.get_ENCODE(
            control["accession"],
            self.connection,
            frame="embedded").get("files", [])
        control_replicates = control.get("replicates", [])

        if len(control_replicates) != len(obj["replicates"]):
            if self.DEBUG:
                print(
                    "Control has {} replicates and experiment has {} replicates"
                    .format(len(control_replicates), len(obj["replicates"])),
                    file=sys.stderr)
            return
        if len(control_files) == 0:
            if self.DEBUG:
                print("Control {} has no files".format(
                    obj["possible_controls"][0]["accession"]),
                      file=sys.stderr)
            return

        exp_data = {}
        con_data = {}
        for exp_file in obj["files"]:
            if exp_file.get("file_type", "") != "fastq":
                continue
            # In MISSING mode only files without controlled_by are collected.
            if self.MISSING and exp_file.get("controlled_by"):
                continue
            self.pair_dict_maker(exp_data, exp_file)
        for con_file in control_files:
            if con_file.get("file_type", "") == "fastq":
                self.pair_dict_maker(con_data, con_file)

        if self.ignore_runtype:
            first, second = exp_data, con_data
        else:
            first, second = con_data, exp_data
        self.mini(first, second, obj)
예제 #56
0
def main():
    """Feed each supported sheet of the input workbook to ``excel_reader``.

    The script stops with exit status 1 if the input file is missing. The
    sheet list is ``--type`` when given, otherwise every sheet of the
    workbook. Sheets whose lower-cased name is not a lower-cased key of
    ``/profiles/`` are reported on stderr and skipped.
    """
    args = getArgs()
    connection = encodedcc.ENC_Connection(
        encodedcc.ENC_Key(args.keyfile, args.key))
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    names = ([args.type] if args.type
             else xlrd.open_workbook(args.infile).sheet_names())
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = [s.lower() for s in list(profiles.keys())]
    for sheet in names:
        if sheet.lower() in supported_collections:
            excel_reader(args.infile, sheet, args.update, connection,
                         args.patchall)
            continue
        print("Sheet name '{name}' not part of supported object types!".
              format(name=sheet),
              file=sys.stderr)
예제 #57
0
def get_char_summary(lot, connection):
    """Count the characterizations of ``lot`` by review outcome.

    Statuses "in progress" and "pending dcc review" count as in progress,
    "exempt from standards" and "compliant" as passing, anything else as
    failing. Returns a dict with the three counters.
    """
    in_progress_statuses = ("in progress", "pending dcc review")
    passing_statuses = ('exempt from standards', "compliant")

    antibody = encodedcc.get_ENCODE(lot, connection, frame="embedded")
    tally = {"number_chars_in_progress": 0,
             "number_chars_passing": 0,
             "number_chars_failing": 0}
    for characterization in antibody.get("characterizations", []):
        status = characterization["status"]
        if status in in_progress_statuses:
            bucket = "number_chars_in_progress"
        elif status in passing_statuses:
            bucket = "number_chars_passing"
        else:
            bucket = "number_chars_failing"
        tally[bucket] += 1
    return tally
예제 #58
0
def files(objList, fileCheckedItems, connection):
    """Print one tab-separated metadata row per file of each experiment.

    Args:
        objList: identifiers of the experiments to fetch.
        fileCheckedItems: names of the columns to print; each is first read
            straight from the file object and may then be overwritten by the
            derived values computed below.
        connection: connection object handed to ``encodedcc.get_ENCODE``.

    Every printed cell is the ``repr()`` of its value.
    """
    for obj in objList:
        exp = encodedcc.get_ENCODE(obj, connection)
        # Fall back to original_files when "files" is empty or absent
        # (any() on a missing value raised TypeError).
        if exp.get("files"):
            expfiles = exp["files"]
        else:
            expfiles = exp["original_files"]
        for f in expfiles:
            fileob = {}
            file = encodedcc.get_ENCODE(f, connection)
            for field in fileCheckedItems:
                fileob[field] = file.get(field)
            fileob["submitted_by"] = encodedcc.get_ENCODE(
                file["submitted_by"], connection)["title"]
            fileob["experiment"] = exp["accession"]
            fileob["experiment-lab"] = encodedcc.get_ENCODE(
                exp["lab"], connection)["name"]
            fileob["biosample"] = exp.get("biosample_term_name", "")
            fileob["flowcell"] = []
            fileob["lane"] = []
            fileob["Uniquely mapped reads number"] = ""
            fileob["biological_replicate"] = ""
            fileob["technical_replicate"] = ""
            fileob["replicate_id"] = ""
            if file.get("file_format", "") == "bam":
                for q in file.get("quality_metrics", []):
                    if "star-quality-metrics" in q:
                        star = encodedcc.get_ENCODE(q, connection)
                        fileob["Uniquely mapped reads number"] = star[
                            "Uniquely mapped reads number"]
            for fcd in file["flowcell_details"]:
                fileob["flowcell"].append(fcd.get("flowcell", ""))
                fileob["lane"].append(fcd.get("lane"))
            # Best effort: the platform column may be absent or unresolvable.
            try:
                fileob["platform"] = encodedcc.get_ENCODE(
                    fileob["platform"], connection)["title"]
            except Exception:
                fileob["platform"] = None
            # Species is resolved by walking replicate -> library ->
            # biosample -> donor -> organism from the first replicate.
            if "replicates" in exp:
                temp_rep = encodedcc.get_ENCODE(exp["replicates"][0],
                                                connection)
                if "library" in temp_rep:
                    temp_lib = encodedcc.get_ENCODE(temp_rep["library"],
                                                    connection)
                    if "biosample" in temp_lib:
                        temp_bio = encodedcc.get_ENCODE(
                            temp_lib["biosample"], connection)
                        if "donor" in temp_bio:
                            temp_don = encodedcc.get_ENCODE(
                                temp_bio["donor"], connection)
                            if "organism" in temp_don:
                                temp_org = encodedcc.get_ENCODE(
                                    temp_don["organism"], connection)
                                fileob["species"] = temp_org["name"]
            else:
                fileob["species"] = ""
            if "replicate" in file:
                rep = encodedcc.get_ENCODE(file["replicate"], connection)
                fileob["biological_replicate"] = rep[
                    "biological_replicate_number"]
                fileob["technical_replicate"] = rep[
                    "technical_replicate_number"]
                fileob["replicate_id"] = rep["uuid"]
                if "library" in rep:
                    library = encodedcc.get_ENCODE(rep["library"], connection)
                    try:
                        fileob["library_aliases"] = library["aliases"]
                    except Exception:
                        fileob["library_aliases"] = ""
                    if "biosample" in library:
                        bio = encodedcc.get_ENCODE(library["biosample"],
                                                   connection)
                        fileob["biosample_aliases"] = bio["aliases"]
            if any(exp.get("aliases", [])):
                fileob["alias"] = exp["aliases"][0]
            else:
                fileob["alias"] = ""
            row = [repr(fileob[j]) for j in fileCheckedItems]
            print('\t'.join(row))
def main():
    """Report mapping and peak status per assembly for a set of experiments.

    The experiments are either the single accession passed as ``--infile``
    (when it contains "ENCSR") or the list produced by
    ``get_experiment_list``. For each experiment a result record is built
    with its replicate count, statuses, audit counts, unarchived processed
    files from other labs, and per-assembly (hg19, GRCh38) flags:

    * ``mapping[assembly]`` is True when every replicate has alignments,
      False when none has, otherwise the list of replicates without.
    * ``peaks[assembly]`` is True when at least one peaks file exists.

    Grouped listings of problem experiments are then printed to stdout.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    assemblies = ['hg19', 'GRCh38']
    summary = []

    if args.infile is not None and 'ENCSR' in args.infile:
        objList = [args.infile]
    else:
        objList = get_experiment_list(args.infile, args.query, connection)

    for obj_id in objList:
        results = {}

        obj = get_ENCODE(obj_id, connection, frame='page')

        # Get basic info
        reps = get_replicate_count(obj)
        results['rep_count'] = len(reps)
        results['status'] = obj['status']
        results['internal_status'] = obj['internal_status']
        results['award'] = obj['award'].get('rfa')
        results['peaks'] = {}
        results['mapping'] = {}
        results['unarchived_files'] = []
        results['status issues'] = []
        results['accession'] = obj['accession']

        # Get audits
        for level in ['WARNING', 'ERROR', 'NOT_COMPLIANT', 'INTERNAL_ACTION']:
            if obj['audit'].get(level):
                results[level] = len(obj['audit'].get(level))

        # Get status issues
        actions = obj['audit'].get('INTERNAL_ACTION')
        if actions:
            status_issues = [
                i for i in actions if i['category'] in [
                    'experiment not submitted to GEO',
                    'mismatched file status', 'mismatched status'
                ]
            ]
            results['status issues'] = status_issues

        # Inspect files
        good_files = [
            f for f in obj['files']
            if f['status'] in ['released', 'in progress']
        ]
        print("There are files in this experiment:", len(obj['files']))
        print("There are good files in this experiment:", len(good_files))
        # look for unarchived processed files from other labs
        processed_files = [
            f for f in obj['files'] if f['file_format'] != 'fastq'
        ]
        external_files = [
            f for f in processed_files
            if (f['lab']['name'] != 'encode-processing-pipeline')
        ]
        unarchived_files = [
            f for f in external_files if (f['status'] != 'archived')
        ]
        results['unarchived_files'] = unarchived_files

        for assembly in assemblies:
            replicates = []
            file_list = [
                f for f in good_files if f.get('assembly') == assembly
            ]
            for rep in reps:
                rep_obj = {'rep': rep}
                file_list_rep = [
                    f for f in file_list
                    if rep in f.get('biological_replicates')
                ]
                aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'alignments'
                ]
                rep_obj['aligns'] = len(aligns)
                raw_aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'unfiltered alignments'
                ]
                rep_obj['raws'] = len(raw_aligns)
                replicates.append(rep_obj)
            failing_replicates = [f for f in replicates if f['aligns'] == 0]
            # Emptiness test instead of `len(...) is 0`, which compared an
            # int by identity.
            if not failing_replicates:
                results['mapping'][assembly] = True
            elif len(replicates) == len(failing_replicates):  # They all fail
                results['mapping'][assembly] = False
            else:
                results['mapping'][assembly] = []
                for rep in failing_replicates:
                    results['mapping'][assembly].append(rep['rep'])

            peaks = [f for f in file_list if f.get('output_type') == 'peaks']
            # Must stay a real bool: the reports below test `is False`.
            results['peaks'][assembly] = len(peaks) > 0

        summary.append(results)

    unarchived_list = [r for r in summary if len(r['unarchived_files']) > 0]
    print('These experiments have unarchived files', len(unarchived_list))
    for item in unarchived_list:
        print(item['accession'])
    print('')
    print('')

    exps_mismatched_states = [
        r for r in summary if len(r['status issues']) > 0
    ]
    print('These experiments have mismatched states',
          len(exps_mismatched_states))
    for item in exps_mismatched_states:
        print(item['accession'])
    print('')
    print('')

    exps_missing_hg38_mapping = [
        r for r in summary if r['mapping']['GRCh38'] is False
    ]
    print('These experiments are missing GRCh38 mapping for all replicates',
          len(exps_missing_hg38_mapping))
    for item in exps_missing_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_partial_hg38_mapping = [
        r for r in summary if r['mapping']['GRCh38'] is not False
        and r['mapping']['GRCh38'] is not True
    ]
    print('These experiments are missing GRCh38 mapping for some replicates',
          len(exps_partial_hg38_mapping))
    for item in exps_partial_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['GRCh38'])
    print('')
    print('')

    exps_missing_hg38_peaks = [
        r for r in summary if r['peaks']['GRCh38'] is False
    ]
    exps_missing_hg38_peaks_but_have_mapping = [
        f for f in exps_missing_hg38_peaks
        if f not in exps_missing_hg38_mapping
        and f not in exps_partial_hg38_mapping
    ]
    print('These experiments are missing GRCh38 peaks but having all mappings',
          len(exps_missing_hg38_peaks_but_have_mapping))
    # List the same experiments that were just counted; the unfiltered
    # list was printed here before, so count and listing disagreed.
    for item in exps_missing_hg38_peaks_but_have_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_missing_hg19_mapping = [
        r for r in summary if r['mapping']['hg19'] is False
    ]
    print('These experiments are missing hg19 mapping for all replicates',
          len(exps_missing_hg19_mapping))
    for item in exps_missing_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')

    exps_partial_hg19_mapping = [
        r for r in summary if r['mapping']['hg19'] is not False
        and r['mapping']['hg19'] is not True
    ]
    print('These experiments are missing hg19 mapping for some replicates',
          len(exps_partial_hg19_mapping))
    for item in exps_partial_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['hg19'])
    print('')
    print('')

    exps_missing_hg19_peaks = [
        r for r in summary
        if r['peaks']['hg19'] is False and r not in exps_missing_hg19_mapping
        and r not in exps_partial_hg19_mapping
    ]
    print('These experiments are missing hg19 peaks',
          len(exps_missing_hg19_peaks))
    for item in exps_missing_hg19_peaks:
        print(item['accession'], item['status'], item['internal_status'],
              'warnings:', item.get('WARNING'))
    print('')
    print('')
def main():
    """Back-fill ``controlled_by`` for the experiments given on the command line.

    Accessions come from ``--infile`` (a file with one accession per line, or
    a comma-separated list) or from ``--query``. Each experiment must have
    replicates, files and possible_controls, and each control must have
    replicates and files; otherwise it is skipped (reasons go to stderr in
    debug mode). The pairing method is ``--method`` when given, otherwise it
    is chosen from the replicate and control counts. Collected results are
    printed as a tab-separated table.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            # Context manager so the input file is closed after reading.
            with open(args.infile) as handle:
                accessions = [line.rstrip('\n') for line in handle]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get(
                "@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # Use the first identifier the object offers.
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
    if len(accessions) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
    else:
        dataList = []
        for acc in accessions:
            obj = encodedcc.get_ENCODE(acc, connection, frame="embedded")
            isValid = True
            check = ["replicates", "files"]
            for c in check:
                if not obj.get(c):
                    if args.debug:
                        print(
                            "Missing {} for {}".format(c, acc),
                            file=sys.stderr)
                    isValid = False
            if obj.get("possible_controls"):
                for p in obj["possible_controls"]:
                    for c in check:
                        # Check the control itself; the experiment was
                        # re-checked here before, so controls were never
                        # validated.
                        if not p.get(c):
                            if args.debug:
                                print(
                                    "Missing {} for {}".format(
                                        c, p["accession"]),
                                    file=sys.stderr)
                            isValid = False
            else:
                isValid = False
                if args.debug:
                    print(
                        "Missing possible_controls for {}".format(acc),
                        file=sys.stderr)
            if isValid:
                b = BackFill(
                    connection,
                    dataList,
                    debug=args.debug,
                    missing=args.missing)
                if args.method == "single":
                    b.single_rep(obj)
                    if args.debug:
                        print("SINGLE REP {}".format(acc))
                elif args.method == "multi":
                    b.multi_rep(obj, args.ignore_runtype)
                    if args.debug:
                        print("MULTI REP {}".format(acc))
                elif args.method == "biosample":
                    b.multi_control(obj)
                    if args.debug:
                        print("BIOSAMPLE {}".format(acc))
                else:
                    # No method requested: infer it from the counts.
                    exp_rep = len(obj["replicates"])
                    exp_con = len(obj["possible_controls"])
                    if exp_con == 1:
                        # one possible control
                        con_rep = len(
                            obj["possible_controls"][0]["replicates"])
                        if con_rep == exp_rep:
                            # same number experiment replicates as control replicates
                            # method is multi
                            b.multi_rep(obj, args.ignore_runtype)
                            if args.debug:
                                print("MULTI REP {}".format(acc))
                        elif con_rep == 1:
                            # one control replicate and multiple experiment replicates
                            # method is single
                            b.single_rep(obj)
                            if args.debug:
                                print("SINGLE REP {}".format(acc))
                        else:
                            if args.debug:
                                print(
                                    "Experiment {} contains {} experiment replicates and {} control replicates and so does not fit the current pattern!"
                                    .format(acc, exp_rep, con_rep))
                    elif exp_con > 1:
                        # more than one possible control
                        con_reps = 0
                        for con in obj["possible_controls"]:
                            if len(con["replicates"]) == 1:
                                con_reps += 1
                        if con_reps == exp_rep:
                            # same number of controls with one replicate as number of experiment replicates
                            # method is biosample
                            b.multi_control(obj)
                            if args.debug:
                                print("BIOSAMPLE {}".format(acc))
                        else:
                            if args.debug:
                                # con_reps, not con_rep: the latter is only
                                # set in the single-control branch.
                                print(
                                    "Experiment {} contains {} experiment replicates and {} control replicates between {} total controls and so does not fit the current pattern!"
                                    .format(acc, exp_rep, con_reps, exp_con))
                    else:
                        if args.debug:
                            print(
                                "Experiment {} does not fit any of the current patterns!"
                                .format(acc))

        if len(dataList) > 0:
            print(
                "Experiment Accession\tCheck Type\tControl Files\tExperiment Files"
            )
            for d in dataList:
                print("{}\t{}\t{}\t{}".format(d["Exp Accession"],
                                              d["Check type"], d["Control"],
                                              d["Experiment"]))