def main():
    """Print a tab-separated header row, then process one or many experiments.

    The command line is parsed with the parser returned by
    ``get_fastqc.get_args()``, extended here with a ``--skipfq`` flag.
    With ``--experiment`` a single accession is passed to ``process_exp``;
    with ``--all`` the server is searched for ENCODE3 experiments of the
    requested assay (default ``OBI:0001863``) that have fastq files, and every
    hit with at least one replicate is passed to ``process_exp``.
    """
    parser = get_fastqc.get_args()
    parser.add_argument(
        "--skipfq",
        action="store_true",
        required=False,
        help="Skip parsing fastqc",
    )
    args = parser.parse_args()

    project = dxencode.get_project(args.project)

    # Header: fixed columns, one column per label, the same labels again with
    # a "lambda " prefix, and a final coverage column.
    leading = ["Experiment", "Replicate"] + labels
    header = leading + ["lambda " + name for name in labels] + ["Estimated Coverage"]
    print("\t".join(header))

    if args.experiment:
        process_exp(args.experiment, project, skipfq=args.skipfq)
        return
    if not args.all:
        # Neither a single experiment nor --all was requested.
        return

    assay_term = args.assay or "OBI:0001863"
    search_path = (
        "/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3"
        "&limit=all&frame=embedded&files.file_format=fastq"
    ) % assay_term

    response = dxencode.encoded_get(SERVER + search_path, AUTHID=AUTHID, AUTHPW=AUTHPW)
    for experiment in response.json()["@graph"]:
        accession = experiment["accession"]
        if len(experiment["replicates"]) == 0:
            # Nothing to report for an experiment without replicates.
            continue
        process_exp(accession, project, skipfq=args.skipfq)
def main():
    """Entry point: emit the table header and run ``process_exp`` per experiment.

    Uses the argument parser from ``get_fastqc.get_args()`` plus a local
    ``--skipfq`` switch. Either a single ``--experiment`` is processed, or
    (``--all``) every ENCODE3 experiment returned by the search for the chosen
    assay term (``OBI:0001863`` when none is given) that lists at least one
    replicate.
    """
    argparser = get_fastqc.get_args()
    argparser.add_argument('--skipfq', action='store_true', required=False,
                           help='Skip parsing fastqc')
    opts = argparser.parse_args()

    dx_project = dxencode.get_project(opts.project)

    # Column names: two fixed columns, the labels, the "lambda "-prefixed
    # labels, and the coverage column.
    fixed_columns = ['Experiment', 'Replicate']
    lambda_columns = ["lambda " + label for label in labels]
    print("\t".join(fixed_columns + labels + lambda_columns + ['Estimated Coverage']))

    if opts.experiment:
        process_exp(opts.experiment, dx_project, skipfq=opts.skipfq)
    elif opts.all:
        assay_id = opts.assay or "OBI:0001863"
        template = ('/search/?type=experiment&assay_term_id=%s'
                    '&award.rfa=ENCODE3&limit=all&frame=embedded'
                    '&files.file_format=fastq')
        search_url = SERVER + template % assay_id

        reply = dxencode.encoded_get(search_url, AUTHID=AUTHID, AUTHPW=AUTHPW)
        records = reply.json()['@graph']

        for record in records:
            accession = record['accession']
            has_replicates = len(record['replicates']) > 0
            if has_replicates:
                process_exp(accession, dx_project, skipfq=opts.skipfq)
def main():
    """Dispatch on the command line: one file, one experiment, or all experiments.

    * ``--file``: fetch the file object from the server, pass its accession to
      ``get_fastqc`` and print the returned metrics as JSON. If the request
      failed, a message is printed and the original error is re-raised.
    * ``--experiment``: pass the accession to ``get_exp_time``.
    * ``--all``: search for mouse ENCODE3 experiments of the chosen assay
      (default ``OBI:0001271``) with fastq files and pass each one to
      ``get_exp_time``, skipping experiments whose first replicate's library
      has a ``size_range`` other than ``'>200'`` or was started from fewer
      than 20 cells.
    """
    argparser = get_args()
    args = argparser.parse_args()

    project = dxencode.get_project(args.project)

    if args.file:
        getr = dxencode.encoded_get(SERVER + args.file, AUTHID=AUTHID, AUTHPW=AUTHPW)
        try:
            getr.raise_for_status()
        except Exception:
            # Say which lookup failed, then let the original error propagate.
            # (Exception rather than a bare except, so that KeyboardInterrupt /
            # SystemExit are not reported as a missing file.)
            print("Could not find %s in db" % args.file)
            raise
        encff = getr.json()
        metrics = get_fastqc(encff['accession'], project)
        print(json.dumps(metrics))
    elif args.experiment:
        get_exp_time(args.experiment, project)
    elif args.all:
        assay = args.assay or "OBI:0001271"
        query = '/search/?type=experiment&assay_term_id=%s&award.rfa=ENCODE3&limit=all&frame=embedded&replicates.library.biosample.donor.organism.name=mouse&files.file_format=fastq' % assay

        res = dxencode.encoded_get(SERVER + query, AUTHID=AUTHID, AUTHPW=AUTHPW)
        exps = res.json()['@graph']

        for exp in exps:
            acc = exp['accession']
            if len(exp['replicates']) > 0:
                # Only the first replicate's library is inspected.
                library = exp['replicates'][0]['library']

                size_range = library.get('size_range', "")
                if size_range != '>200':
                    print("Skipping %s with wrong library size (%s)" % (acc, size_range))
                    continue

                quantity_units = library.get('nucleic_acid_starting_quantity_units', "")
                if quantity_units == "cells":
                    ncells = float(library.get('nucleic_acid_starting_quantity', 0.0))
                    if ncells < 20:
                        print("Skipping %s as single-cell (%s %s)" % (acc, quantity_units, ncells))
                        continue
            # NOTE(review): experiments with no replicates bypass the filters
            # above and are still processed here - confirm this is intended.
            get_exp_time(acc, project)
Exemplo n.º 4
0
def main():
    """Plan and launch (or, with ``--test``, only report) the workflow for one replicate.

    Sequence of operations:

    1. Fetch the experiment named by ``args.experiment`` and select the
       mapping for replicate ``(args.br, args.tr)``.
    2. Pick the genome (``lambda`` when ``args.maplambda`` is set, otherwise
       ``mm10`` for mouse or ``hg19`` for human) and decide whether the
       replicate is paired-end or single-end.
    3. Make sure the results folder exists (skipped in test mode).
    4. Collect prior results, read files and reference files, and determine
       which steps still have to run.
    5. Print the plan, move superseded results to ``<results>/deprecated``,
       then create and log the workflow run.

    Exits with status 1 when the experiment has no replicates, the replicate
    is unknown, the organism is unsupported, or the replicate has no reads or
    a mix of paired and unpaired reads. Exits with status 0 when there is
    nothing to do or when running in test mode.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    # A missing or empty 'replicates' entry both mean there is nothing to run.
    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    elif mapping['organism'] == 'mouse':
        genome = 'mm10'
    elif mapping['organism'] == 'human':
        genome = 'hg19'
    else:
        print("Organism %s not currently supported" % mapping['organism'])
        sys.exit(1)

    # Exactly one of the paired / unpaired read lists must be non-empty.
    has_unpaired = bool(mapping['unpaired'])
    has_paired = bool(mapping['paired'])
    if not has_unpaired and not has_paired:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    if has_unpaired and has_paired:
        # Each len() gets its own argument; the message needs two counts.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)
    pairedEnd = has_paired

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment, replicate,
                                    mapping['library'], pairedEnd, args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    # The results location is always the default folder; any value supplied on
    # the command line is overwritten here.
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Fastq file names are grouped by the 'paired_end' value of each mate.
        paired_fqs = {
            '1': [],
            '2': [],
        }
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda is passed as True regardless of args.maplambda -
    #               confirm this is intended.
    priors = findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(priors, paired_fqs['1'], args.test,
                                                   'pair1_reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(priors, paired_fqs['2'], args.test,
                                                   'pair2_reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(priors, unpaired_fqs, args.test,
                                                   'reads', resultsFolder,
                                                   arrayInput=True, projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = determineStepsToDo(pairedEnd, priors, deprecateFiles, projectId,
                                   force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print("     on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print("  " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print("  " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print("  " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)
    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print("  If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print("  rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)
    else:
        print("- Steps to run:")
        if pairedEnd:
            steps = STEP_ORDER['pe']
        else:
            steps = STEP_ORDER['se']
        for step in steps:
            STEPS[step] = calculate_steps(step)
            if step in stepsToDo:
                print("  * " + STEPS[step]['app'] + " will be run")
            elif not step.startswith('concat'):
                # Steps whose name starts with 'concat' are not reported as
                # already run.
                print("    " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'.")
            for fid in deprecateFiles:
                print("  " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  resultsFolder + "/deprecated'...")
            dxencode.move_files(deprecateFiles, resultsFolder + "/deprecated", projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print("  We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print("  Launched " + wfRun['id'])
    print("(success)")
Exemplo n.º 5
0
def main():
    """Submit the result files of a completed pipeline run for one replicate.

    Sequence of operations:

    1. Fetch the experiment named by ``args.experiment`` and select the
       mapping for replicate ``(args.br, args.tr)``; record the replicate name
       and the genome (looked up in ``GENOME_MAPPING``) on the mapping.
    2. Decide whether the replicate is paired-end or single-end and build the
       step table for that path.
    3. Locate prior results and the read files, then check that no pipeline
       step is still outstanding; if any is, report it and exit.
    4. For every prior result that has an entry in ``POST_TEMPLATES``, fill in
       the file metadata and run the ``validate-post`` applet (or, with
       ``--test``, record a fake accession). A file whose ``derived_from``
       parents have not been submitted yet is put back at the end of the
       queue.

    Exits with status 1 when the experiment has no replicates, the replicate
    is unknown, the organism is unsupported, or the replicate has no reads or
    a mix of paired and unpaired reads. Exits with status 0 when the pipeline
    is incomplete or after a test run.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    # A missing or empty 'replicates' entry both mean there is nothing to do.
    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    mapping['replicate'] = replicate

    # Look the organism up once so the error message cannot itself raise
    # KeyError when the mapping has no 'organism' entry.
    organism = mapping.get('organism', "Not Found")
    try:
        mapping['genome'] = GENOME_MAPPING[organism]
    except KeyError:
        print("Organism %s not currently supported" % organism)
        sys.exit(1)

    # Exactly one of the paired / unpaired read lists must be non-empty.
    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() gets its own argument; the message needs two counts.
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting."
              % (len(mapping['paired']), len(mapping['unpaired'])))
        print(mapping)
        sys.exit(1)

    psv = pipeline_specific_vars(args, mapping, pairedEnd)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    ## TODO this is a bunch of ugly
    if pairedEnd:
        # Fastq file names are grouped by the 'paired_end' value of each mate;
        # the bare accessions are kept separately for 'derived_from' links.
        paired_fqs = {
            '1': [],
            '2': [],
        }
        read1s = []
        read2s = []
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
            read1s.append(p1['accession'])
            read2s.append(p2['accession'])
        pipePath = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [f['accession'] + ".fastq.gz" for f in mapping['unpaired']]
        pipePath = STEP_ORDER['se']

    for step in pipePath:
        STEPS[step] = calculate_steps(step)

    pipeSteps = STEPS
    ## warning ugly kludge here
    # Flatten the per-app 'results' tables into one token -> glob dictionary.
    file_globs = {}
    for app in STEPS.keys():
        file_globs.update(STEPS[app]['results'])

    print("Checking for prior results...")

    priors = dxencode.find_prior_results(pipePath, pipeSteps, psv['resultsFolder'],
                                         file_globs, projectId)

    # 'submitted' maps a token to the accessions already on the server; the
    # reads seed it so files derived from 'all_reads' can be linked.
    if pairedEnd:
        priors['pair1_reads'] = dxencode.find_file_set(paired_fqs["1"], projectId)
        priors['pair2_reads'] = dxencode.find_file_set(paired_fqs["2"], projectId)
        priors['all_reads'] = priors['pair1_reads'] + priors['pair2_reads']
        submitted = {
            'all_reads': read1s + read2s,
        }
    else:
        priors['reads'] = dxencode.find_file_set(unpaired_fqs, projectId)
        priors['all_reads'] = priors['reads']
        submitted = {
            'all_reads': [f['accession'] for f in mapping['unpaired']],
        }

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    deprecateFiles = []  # old results will need to be moved/removed if step is rerun
    stepsToDo = dxencode.determine_steps_to_run(pipePath, pipeSteps, priors, deprecateFiles,
                                                projectId, verbose=True)

    print("Checking for currently running analyses...")
    dxencode.check_run_log(psv['resultsFolder'], projectId, verbose=True)

    if len(stepsToDo):
        print("Pipeline incomplete, please resubmit jobs: %s" % stepsToDo)
        sys.exit(0)

    print(priors)
    to_submit = [k for k in priors.keys() if POST_TEMPLATES.get(k)]
    n = 0  # number of queue entries handled so far (including re-queued ones)
    print("Attempting to submit %s files to %s" % (len(to_submit), args.experiment))
    while to_submit:
        # Re-queued tokens could otherwise cycle forever when a parent never
        # gets submitted; give up after len(priors)**2 iterations.
        if n > len(priors) * len(priors):
            print("Too many itereations: %s" % priors)
            break
        token = to_submit.pop(0)
        print("%s %s - %s" % (token, priors[token], n))
        f_ob = POST_TEMPLATES.get(token, None)
        n += 1
        if f_ob:
            derive_check = f_ob.get('derived_from', [])
            if derive_check:
                derived = [submitted[f] for f in derive_check if submitted.get(f)]
                if not derived:
                    # None of the parents has an accession yet: try again later.
                    to_submit.append(token)
                    continue
                else:
                    f_ob['derived_from'] = list(itertools.chain(*derived))
            dxFile = dxpy.DXFile(dxid=priors[token])
            print("Post File: %s %s" % (token, dxFile.name))
            f_ob['dataset'] = args.experiment
            f_ob['lab'] = '/labs/j-michael-cherry/'
            f_ob['award'] = '/awards/U41HG006992/'
            f_ob['assembly'] = mapping['genome']
            ## temporary haxors until file display works
            # NOTE(review): this function only sets mapping['replicate'];
            # 'replicate_id' is presumably supplied by
            # choose_mapping_for_experiment - confirm.
            f_ob['replicate'] = mapping['replicate_id']
            f_ob['notes'] = json.dumps(dxencode.create_notes(dxFile, get_software()))
            print(json.dumps(f_ob, sort_keys=True, indent=4, separators=(',', ': ')))
            if args.testserver:
                server = 'test'
            else:
                server = 'www'

            if args.test:
                fake_acc = 'ENCFF%03dAAA' % n
                print("Fake submission: %s" % fake_acc)
                submitted[token] = [fake_acc]
            else:
                applet = dxencode.find_applet_by_name('validate-post', projectId)
                job = applet.run({
                    "pipe_file": dxpy.dxlink(dxFile),
                    "file_meta": f_ob,
                    "key": server,
                    "debug": True,
                    "skipvalidate": args.skipvalidate or False
                    })
                print("Submitting %s" % job.id)
                job.wait_on_done(interval=1)
                # Describe the finished job once and read both fields from it.
                output = job.describe()['output']
                accession = output.get('accession', "Unknown Acc")
                error = output.get('error', "Unknown Error")
                submitted[token] = [accession]
                print("Posted (%s): %s" % (error, accession))

    # Exit if test only
    if args.test:
        print("Fake submitted %s files." % n)
        sys.exit(0)
Exemplo n.º 6
0
def main():
    """Plan and launch (or, with ``--test``, only report) the workflow for one replicate.

    The experiment named by ``args.experiment`` is fetched and the mapping for
    replicate ``(args.br, args.tr)`` selected. From it the genome (``lambda``
    when ``args.maplambda`` is set, else ``mm10`` for mouse or ``hg19`` for
    human) and the read layout (paired-end or single-end) are derived. Prior
    results, read files and reference files are then collected, the steps
    still to run are determined and reported, superseded results are moved to
    ``<results>/deprecated``, and finally the workflow is created and logged.

    Exits with status 1 when the experiment has no replicates, the replicate
    is unknown, the organism is unsupported, or the replicate has no reads or
    a mix of paired and unpaired reads. Exits with status 0 when there is
    nothing to do or when running in test mode.
    """
    args = get_args()

    (AUTHID, AUTHPW, SERVER) = dxencode.processkey('default')
    url = SERVER + 'experiments/%s/?format=json&frame=embedded' % (
        args.experiment)
    response = dxencode.encoded_get(url, AUTHID, AUTHPW)
    exp = response.json()

    # A missing or empty 'replicates' entry both mean there is nothing to run.
    if not exp.get('replicates'):
        print("No replicates found in %s\n%s" % (args.experiment, exp))
        sys.exit(1)

    replicate = "%s_%s" % (args.br, args.tr)

    reps_mapping = dxencode.choose_mapping_for_experiment(exp)
    # could try to do all replicates here
    try:
        mapping = reps_mapping[(args.br, args.tr)]
    except KeyError:
        print("Specified replicate: %s could not be found in mapping." % replicate)
        print(reps_mapping)
        sys.exit(1)

    if args.maplambda:
        genome = 'lambda'
    else:
        if mapping['organism'] == 'mouse':
            genome = 'mm10'
        elif mapping['organism'] == 'human':
            genome = 'hg19'
        else:
            print("Organism %s not currently supported" % mapping['organism'])
            sys.exit(1)

    # Exactly one of the paired / unpaired read lists must be non-empty.
    if mapping['unpaired'] and not mapping['paired']:
        pairedEnd = False
    elif mapping['paired'] and not mapping['unpaired']:
        pairedEnd = True
    elif not mapping['unpaired'] and not mapping['paired']:
        print("Replicate has no reads either paired or unpaired")
        print(mapping)
        sys.exit(1)
    else:
        # Each len() gets its own argument; the message needs two counts.
        paired_count = len(mapping['paired'])
        unpaired_count = len(mapping['unpaired'])
        print("Replicate has both paired(%s) and unpaired(%s) reads, quitting." % (
            paired_count, unpaired_count))
        print(mapping)
        sys.exit(1)

    extras = pipelineSpecificExtras(genome, mapping['sex'], args.experiment,
                                    replicate, mapping['library'], pairedEnd,
                                    args.gzip)
    project = dxencode.get_project(args.project)
    projectId = project.get_id()

    # The results location is always the default folder; any value supplied on
    # the command line is overwritten here.
    args.resultsLoc = RESULT_FOLDER_DEFAULT  # not sure we need genome
    resultsFolder = args.resultsLoc + '/' + args.experiment + '/' + replicate
    if args.maplambda:
        resultsFolder = resultsFolder + '/lambda'
    if not args.test:
        if not dxencode.project_has_folder(project, resultsFolder):
            project.new_folder(resultsFolder, parents=True)

    if pairedEnd:
        # Fastq file names are grouped by the 'paired_end' value of each mate.
        paired_fqs = {'1': [], '2': []}
        for (p1, p2) in mapping['paired']:
            paired_fqs[p1['paired_end']].append(p1['accession'] + ".fastq.gz")
            paired_fqs[p2['paired_end']].append(p2['accession'] + ".fastq.gz")
        steps = STEP_ORDER['pe']
        print("Generating workflow steps (paired-end)...")
    else:
        unpaired_fqs = [
            f['accession'] + ".fastq.gz" for f in mapping['unpaired']
        ]
        steps = STEP_ORDER['se']
        print("Generating workflow steps (single-end)...")
    for step in steps:
        STEPS[step] = calculate_steps(step)

    print("Checking for prior results...")
    # Check if there are previous results
    # Perhaps reads files are already there?
    # NOTE: priors is a dictionary of fileIds that will be used to determine stepsToDo
    #       and fill in inputs to workflow steps
    # NOTE(review): maplambda is passed as True regardless of args.maplambda -
    #               confirm this is intended.
    priors = findPriorResults(pairedEnd,
                              resultsFolder,
                              projectId,
                              maplambda=True)

    print("Checking for read files...")
    # Find all reads files and move into place
    # TODO: files could be in: dx (usual), remote (url e.g.https://www.encodeproject.org/...
    #       or possibly local, Currently only DX locations are supported.
    if pairedEnd:
        reads1 = dxencode.find_and_copy_read_files(priors,
                                                   paired_fqs['1'],
                                                   args.test,
                                                   'pair1_reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)
        reads2 = dxencode.find_and_copy_read_files(priors,
                                                   paired_fqs['2'],
                                                   args.test,
                                                   'pair2_reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)
    else:
        # trim-se and trim-pe use different input tokens.
        reads1 = dxencode.find_and_copy_read_files(priors,
                                                   unpaired_fqs,
                                                   args.test,
                                                   'reads',
                                                   resultsFolder,
                                                   arrayInput=True,
                                                   projectId=projectId)

    print("Looking for reference files...")
    findReferenceFiles(GENOME_REFERENCES.keys(), priors, args.refLoc, extras)

    print("Determining steps to run...")
    # NOTE: stepsToDo is an ordered list of steps that need to be run
    # old results will need to be moved/removed if step is rerun
    deprecateFiles = []
    stepsToDo = determineStepsToDo(pairedEnd,
                                   priors,
                                   deprecateFiles,
                                   projectId,
                                   force=args.force)

    # Report the plans
    print("Running '" + extras['title'] + "'")
    print("     on " + extras['subTitle'])
    if pairedEnd:
        print("- Reads1: ")
    else:
        print("- Reads: ")
    for fid in reads1:
        print("  " + dxencode.file_path_from_fid(fid))
    if pairedEnd:
        print("- Reads2: ")
        for fid in reads2:
            print("  " + dxencode.file_path_from_fid(fid))
    print("- Reference files:")
    for token in GENOME_REFERENCES.keys():
        print("  " + dxencode.file_path_from_fid(priors[token], True))
    print("- Results written to: " + args.project + ":" + resultsFolder)
    if len(stepsToDo) == 0:
        print("* All expected results are in the results folder, so there is nothing to do.")
        print("  If this experiment/replicate needs to be rerun, then use the --force flag to ")
        print("  rerun all steps; or remove suspect results from the folder before launching.")
        sys.exit(0)
    else:
        print("- Steps to run:")
        if pairedEnd:
            steps = STEP_ORDER['pe']
        else:
            steps = STEP_ORDER['se']
        for step in steps:
            STEPS[step] = calculate_steps(step)
            if step in stepsToDo:
                print("  * " + STEPS[step]['app'] + " will be run")
            else:
                # Steps whose name starts with 'concat' are not reported as
                # already run.
                if not step.startswith('concat'):
                    print("    " + STEPS[step]['app'] + " has already been run")

    print("Checking for currently running analyses...")
    checkRunsPreviouslyLaunched(resultsFolder, projectId)

    if len(deprecateFiles) > 0:
        deprecatedFolder = resultsFolder + "/deprecated"
        if args.test:
            print("Would move " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  deprecatedFolder + "'.")
            for fid in deprecateFiles:
                print("  " + dxencode.file_path_from_fid(fid))
        else:
            print("Moving " + str(len(deprecateFiles)) + " prior result file(s) to '" +
                  deprecatedFolder + "'...")
            dxencode.move_files(deprecateFiles, deprecatedFolder, projectId)

    # Exit if test only
    if args.test:
        print("TEST ONLY - exiting.")
        sys.exit(0)

    print("Launch sequence initiating...")
    wfRun = createWorkflow(stepsToDo, priors, extras, resultsFolder, projectId)

    print("  We have liftoff!")
    logThisRun(wfRun['id'], resultsFolder, projectId)

    print("  Launched " + wfRun['id'])
    print("(success)")