def allCountsForOneFileAsString(allCounts):
    shared_pipe.init()
    build_line = ""
    counts = ['rdata', 'counted']
    counts.extend(shared_pipe.NAMED_CUTOFFS.keys())
    for one_count in counts:
        build_line += " " + one_count + " :" +  str(allCounts[one_count])
    return build_line
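#Output format sketch (illustrative; 'p_lt_0.05' is an assumed cutoff name,
#the real ones come from shared_pipe.NAMED_CUTOFFS):
#    allCountsForOneFileAsString({'rdata': 1, 'counted': 120, 'p_lt_0.05': 40})
#    # -> " rdata :1 counted :120 p_lt_0.05 :40"
#addCountsFromOneLine() below relies on this "name :value" spacing when it
#strips the ':' back off each value.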
def getEmptyCounts():
    shared_pipe.init()
    #The status 'es_skipped' should ALWAYS indicate that a record with that
    #ID was already present in the Elasticsearch index.
    #cutoff_total is the number of records that meet the p-value cutoff;
    #if p-value cutoffs are disabled, this number will be 0.
    counts = ['COMPLETE', 'IN_PROGRESS', 'NOT_STARTED',
              'rdata', 'cutoff_total', 'es_added', 'es_skipped', 'other']
    return dict.fromkeys(counts, 0)
def readOneProgressFile(progFilePath):
    shared_pipe.init()
    summary = getEmptyCounts()
    progFile = open(progFilePath, 'r')
    fileLines = []
    for line in progFile:
        fileLines.append(line)
        terms = line.split()
        summary.update(addCountsFromOneLine(terms, summary))
    progFile.close()
    return { "summary" : summary,  
             "fileLines" : fileLines }
def checkAllDirs(parent):
    shared_pipe.init() 

    #number of job dirs per condor submit file.
    submitSize = shared_pipe.SETTINGS['chunk_count'] / \
                 shared_pipe.SETTINGS['n_submit_files']
    completedDirs = 0

    overallProgressFile = open('progress.txt', 'a+')
    #should be regenerated once per status check run, regardless of which 
    #data sets are being checked.

    overallSummary = getEmptyCounts()
    k = 0; completeDirsForBatch = 0 #k counts directories in a batch.
    whichBatch = 0 
    batches = setup_list_of_submit_files(shared_pipe.SETTINGS['n_submit_files'])
     
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        dirName = '/'.join([parent, 'chunk' + str(i).zfill(2)])
        dirResults = checkOneDirectory(dirName)
        overallSummary = update_overall_counts(overallSummary, 
                                               dirResults['summary'])
        #writeToFileAndPrint(dirResults['fileLines'], overallProgressFile)

        #overallProgressFile.writelines(dirResults['fileLines'])
        k += 1 #This will get reset once per batch.
        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1 
            completeDirsForBatch += 1 

        #End of batch: turn over all per-batch calculations.
        if k == submitSize: 
            if completeDirsForBatch == submitSize: 
                batches[whichBatch] = True        
            k = 0; completeDirsForBatch = 0; whichBatch += 1 

    how_many_complete_batches = show_which_batches_are_done(batches)
    writeToFileAndPrint( "\tcompleted " + str(completedDirs) + " out of " + \
          str(shared_pipe.SETTINGS['chunk_count']) + " directories.",        \
         overallProgressFile)
    writeToFileAndPrint("\tcompleted " + str(how_many_complete_batches) + " out of " +\
          str(shared_pipe.SETTINGS['n_submit_files']) + " submit files.",             \
          overallProgressFile)

    #pretty sure that we should use the submit file called 'completedChunks' next.
    writeToFileAndPrint('  summary for all active jobs:' ,  overallProgressFile)
    writeToFileAndPrint(str(overallSummary) , overallProgressFile)
    overallProgressFile.close()
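#Arithmetic sketch with assumed settings (not the real values): if chunk_count
#were 100 and n_submit_files were 10, submitSize would be 10, so batches[b]
#only flips to True once all 10 chunk directories belonging to submit file b
#pass checkForCompleteDir().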
def setupJobDirs(input_path):
    shared_pipe.init() 
    filesToCopy = ['multi_pipeline.py', 
                   'rdata2sqlite.R', 
                   'count_records_matching_cutoffs.py',
                   'shared_pipe.py']

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        jobDir = 'chunk' + str(i).zfill(2)
        if not os.path.exists(jobDir):
            os.makedirs(jobDir)
        else:
            print "Directory : " + jobDir + " already exists."
            print "delete all chunk* directories to use this script."
            exit(1)
        for oneFile in filesToCopy:
            shutil.copyfile(oneFile, jobDir + "/" + oneFile)

    fList = get_file_list(input_path)    
    chunk_size = float(len(fList)) /    \
                float((shared_pipe.SETTINGS['chunk_count']))
    chunk_size = math.ceil(chunk_size)
    print "chunk count = " + str(shared_pipe.SETTINGS['chunk_count'])
    print "this many files in the list: " + str(len(fList))
    print "chunk size: " + str(chunk_size)

    progressFile = None 
    chunk = 0 

    #should list files 0 to N-1. Not 1 to N
    for index, oneFile in enumerate(fList):
        #print "index = " + str(index)
        if index % chunk_size == 0 or index == 0:
            if progressFile is not None:
                progressFile.close()
                chunk += 1
            progPath = 'chunk' + str(chunk).zfill(2) + '/progress.txt'
            print "setting up a new progress file: " + progPath
            progressFile = open(progPath, 'a+')
        oneLine = " ".join([oneFile,
                            str(shared_pipe.PROGRESS_STATES['NOT_STARTED']),
                            "\n"])
        progressFile.write(oneLine)

    if progressFile is not None:
        progressFile.close()
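#Resulting layout sketch (illustrative; the exact names depend on chunk_count
#and on the input file list):
#    chunk00/  multi_pipeline.py  rdata2sqlite.R  ...  progress.txt
#    chunk01/  ...
#Each chunkNN/progress.txt gets one "<input file> <NOT_STARTED code>" line per
#file assigned to that chunk.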
def readOneProgressFile(progFilePath):
    shared_pipe.init()
    summary = { 'COMPLETE' : 0, 'IN_PROGRESS' : 0, 'NOT_STARTED' : 0 } 
    progFile = open(progFilePath,  'r')
    fileLines = []
    for line in progFile:
        fileLines.append(line)
        terms = line.split()
        if int(terms[1]) == shared_pipe.PROGRESS_STATES['COMPLETE']:
            summary['COMPLETE'] += 1
        elif int(terms[1]) == shared_pipe.PROGRESS_STATES['IN_PROGRESS']:
            summary['IN_PROGRESS'] += 1
        elif int(terms[1]) == shared_pipe.PROGRESS_STATES['NOT_STARTED']:
            summary['NOT_STARTED'] += 1
    progFile.close()
    return { "summary" : summary,  
             "fileLines" : fileLines }
def addCountsFromOneLine(terms, summary):
    shared_pipe.init()   #can this be factored to 1 call at the top of the file? 
    pgs = dict(zip(shared_pipe.PROGRESS_STATES.values(), 
                   shared_pipe.PROGRESS_STATES.keys() ) ) 
    status = pgs[int(terms[1])]
    summary[status] += 1

    if status == 'COMPLETE':
        specific_counts = terms[2:]
        #print "specific counts: " + repr(specific_counts)
        i = 0
        while i < len(specific_counts):
            value_called = specific_counts[i]
            #print "value called " + value_called
            value = int(specific_counts[i+1].replace(':', ''))
            #print "value : "  + str(value)
            summary[value_called] += value
            i += 2
    return summary 
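#Hedged round-trip sketch (illustrative only; 'some_scores.RData' and the
#chosen count names are placeholders, but every key used here exists in
#getEmptyCounts()):
def _example_add_counts():
    line = " ".join(["some_scores.RData",
                     str(shared_pipe.PROGRESS_STATES['COMPLETE']),
                     "rdata :1 es_added :118 es_skipped :2"])
    summary = getEmptyCounts()
    #COMPLETE goes up by one and the rdata/es_added/es_skipped totals are added.
    return addCountsFromOneLine(line.split(), summary)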
def setupJobDirs(input_path):
    shared_pipe.init() 
    filesToCopy = ['snp_info_pipeline.py', 
                   'process_snpinfo.R', 
                   'put_snp_info_in_es.py', 
                   'shared_pipe.py']

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        print "setting up directory for " + str(i)
        jobDir = 'chunk' + str(i).zfill(2)
        if not os.path.exists(jobDir):
            os.makedirs(jobDir)
        else:
            print "Directory : " + jobDir + " already exists."
            print "delete all chunk* directories to use this script."
            exit(1)
        for oneFile in filesToCopy:
            shutil.copyfile(oneFile, jobDir + "/" + oneFile)


    fList = get_file_list(input_path)    
    print "fList :" + repr(fList)
    chunk_size = len(fList) /    \
                (shared_pipe.SETTINGS['chunk_count'] - 1)
    progressFile = None 
    chunk = 0 

    for index, oneFile in enumerate(fList):
        if index % chunk_size == 0 or index == 0:
            if progressFile is not None:
                progressFile.close()
                chunk += 1
            progPath = '/'.join(['chunk' + str(chunk).zfill(2), 'progress.txt'])
            progressFile = open(progPath, 'a+')
        oneLine = " ".join([oneFile,
                            str(shared_pipe.PROGRESS_STATES['NOT_STARTED']),
                            "\n"])
        progressFile.write(oneLine)

    progressFile.close()  #close the last progress file that is written.
    print "this many files in the list: " + str(len(fList))
    print "chunk size: " + str(chunk_size)
def setupJobDirs(input_path):
    shared_pipe.init()
    filesToCopy = [
        'snp_info_pipeline.py', 'process_snpinfo.R', 'put_snp_info_in_es.py',
        'shared_pipe.py'
    ]

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        print "setting up directory for " + str(i)
        jobDir = 'chunk' + str(i).zfill(2)
        if not os.path.exists(jobDir):
            os.makedirs(jobDir)
        else:
            print "Directory : " + jobDir + " already exists."
            print "delete all chunk* directories to use this script."
            exit(1)
        for oneFile in filesToCopy:
            shutil.copyfile(oneFile, jobDir + "/" + oneFile)

    fList = get_file_list(input_path)
    print "fList :" + repr(fList)
    chunk_size = len(fList) /    \
                (shared_pipe.SETTINGS['chunk_count'] - 1)
    progressFile = None
    chunk = 0

    for index, oneFile in enumerate(fList):
        if index % chunk_size == 0 or index == 0:
            if progressFile is not None:
                progressFile.close()
                chunk += 1
            progPath = '/'.join(
                ['chunk' + str(chunk).zfill(2), 'progress.txt'])
            progressFile = open(progPath, 'a+')
        oneLine = " ".join(
            [oneFile,
             str(shared_pipe.PROGRESS_STATES['NOT_STARTED']), "\n"])
        progressFile.write(oneLine)

    progressFile.close()  #close the last progress file that is written.
    print "this many files in the list: " + str(len(fList))
    print "chunk size: " + str(chunk_size)
Example #10
def checkAllDirs():
    shared_pipe.init()

    #number of job dirs per condor submit file.
    submitSize = shared_pipe.SETTINGS['chunk_count'] / \
                 shared_pipe.SETTINGS['n_submit_files']
    completedDirs = 0

    if os.path.isfile('progress.txt'):
        print "Removing existing overall progress file to rebuild it..."
        os.remove('progress.txt')
    overallProgressFile = open('progress.txt', 'a+')

    overallSummary = getEmptyCounts()

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):

        dirName = 'chunk' + str(i).zfill(2)
        dirResults = checkOneDirectory(dirName)

        #Does the following line effectively replace the block of code that follows?
        #print "right before the call to update_overall_counts" + str(overallSummary)
        overallSummary = update_overall_counts(overallSummary,
                                               dirResults['summary'])
        print "summary count ; " + str(dirResults['summary']['counted'])

        #This is where all of the special stats get added.
        overallProgressFile.writelines(dirResults['fileLines'])

        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1

    completedChunks = completedDirs / submitSize
    print 'completed counting ' + str(completedDirs) + " out of " + \
          str(shared_pipe.SETTINGS['chunk_count']) + " directories"
    print "completed counting " + str(completedChunks) + " out of " +\
          str(shared_pipe.SETTINGS['n_submit_files']) + " submit files."
    #pretty sure that we should use the submit file called 'completedChunks' next.
    overallProgressFile.write('summary for all active jobs:')
    overallProgressFile.write(str(overallSummary))
    overallProgressFile.close()
Example #11
def checkAllDirs():
    shared_pipe.init()

    #number of job dirs per condor submit file.
    submitSize = shared_pipe.SETTINGS['chunk_count'] / \
                 shared_pipe.SETTINGS['n_submit_files']
    completedDirs = 0

    if os.path.isfile('progress.txt'):
        print "Removing existing overall progress file to rebuild it..."
        os.remove('progress.txt')
    overallProgressFile = open('progress.txt', 'a+')
    overallSummary = {'COMPLETE': 0, 'IN_PROGRESS': 0, 'NOT_STARTED': 0}

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):

        dirName = 'chunk' + str(i).zfill(2)
        dirResults = checkOneDirectory(dirName)

        overallSummary['COMPLETE'] += dirResults['summary']['COMPLETE']
        overallSummary['IN_PROGRESS'] += dirResults['summary']['IN_PROGRESS']
        overallSummary['NOT_STARTED'] += dirResults['summary']['NOT_STARTED']
        overallProgressFile.writelines(dirResults['fileLines'])

        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1

    completedChunks = completedDirs / submitSize
    print 'completed ' + str(completedDirs) + " out of " + \
          str(shared_pipe.SETTINGS['chunk_count']) + " directories"
    print "completed " + str(completedChunks) + " out of " +\
          str(shared_pipe.SETTINGS['n_submit_files']) + " submit files."
    #pretty sure that we should use the submit file called 'completedChunks' next.
    overallProgressFile.write('summary for all active jobs:')
    overallProgressFile.write(str(overallSummary))
    overallProgressFile.close()
def checkAllDirs():
    shared_pipe.init() 

    #number of job dirs per condor submit file.
    submitSize = shared_pipe.SETTINGS['chunk_count'] / \
                 shared_pipe.SETTINGS['n_submit_files']
    completedDirs = 0

    if os.path.isfile('progress.txt'):
        print "Removing existing overall progress file to rebuild it..."
        os.remove('progress.txt')
    overallProgressFile = open('progress.txt', 'a+')
    overallSummary = { 'COMPLETE' : 0, 'IN_PROGRESS' : 0, 'NOT_STARTED' : 0 } 

    for i in range(0, shared_pipe.SETTINGS['chunk_count']):

        dirName = 'chunk' + str(i).zfill(2)
        dirResults = checkOneDirectory(dirName)

        overallSummary['COMPLETE'] += dirResults['summary']['COMPLETE'] 
        overallSummary['IN_PROGRESS'] += dirResults['summary']['IN_PROGRESS'] 
        overallSummary['NOT_STARTED'] += dirResults['summary']['NOT_STARTED']     
        overallProgressFile.writelines(dirResults['fileLines'])

        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1 

    completedChunks = completedDirs / submitSize
    print 'completed ' + str(completedDirs) + " out of " + \
          str(shared_pipe.SETTINGS['chunk_count']) + " directories"
    print "completed " + str(completedChunks) + " out of " +\
          str(shared_pipe.SETTINGS['n_submit_files']) + " submit files."
    #pretty sure that we should use the submit file called 'completedChunks' next.
    overallProgressFile.write('summary for all active jobs:')
    overallProgressFile.write(str(overallSummary))
    overallProgressFile.close()
import sqlite3
from elasticsearch import Elasticsearch, helpers, ConnectionError
import elasticsearch
import time
import json
import sys
import requests
import os
import zlib
import math
import pickle

import shared_pipe

shared_pipe.init()
pStates = shared_pipe.PROGRESS_STATES
cutoffs = shared_pipe.PVALUE_CUTOFFS
use_cutoffs = shared_pipe.RESTRICT_BY_PVALUE


#SHOULD NOT CHANGE:
sqlite_table_name = 'scores_data'

"""
A script to import sqlite3 tables into Elasticsearch
To use: give it one argument: the name of the directory to find the input files
in with no trailling space.
 then (in the shared_pipe settings file) change DRY_RUN to False.
"""
DRY_RUN = shared_pipe.DRY_RUN #False 
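#Invocation sketch (the script and directory names here are placeholders):
#    python this_import_script.py /path/to/input_dir
#Leaving shared_pipe.DRY_RUN as True presumably keeps the run from writing to
#Elasticsearch; set it to False in shared_pipe to perform the real import.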
def setup_initial_counts():
    shared_pipe.init()
    empty_dict = dict.fromkeys(shared_pipe.NAMED_CUTOFFS.keys(), 0)
    empty_dict.update({'counted': 0}) 
    return empty_dict
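#For example, if shared_pipe.NAMED_CUTOFFS had the keys 'p_lt_0.05' and
#'p_lt_0.01' (assumed names), setup_initial_counts() would return
#{'p_lt_0.05': 0, 'p_lt_0.01': 0, 'counted': 0}.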
Example #15
import os, grp, math
import shutil
import os.path
import re
import shared_pipe
from shared_pipe import WhichTFLibs, print_with_color, print_error
import sys
from subprocess import call
import subprocess
# What does this file do?
# It breaks up the whole collection of files into N directories,
# creating a separate condor submit file for each batch of rangeSize directories.
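# Worked example with assumed numbers (not the real settings): 100 chunk
# directories and a rangeSize of 10 would give chunk00 .. chunk99 plus ten
# condor submit files, each covering ten consecutive chunk directories.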
shared_pipe.init()

#The following code is used to ensure that directories created by this 
#script can be deleted by other users in the group 'atsnp'.
#(Thanks to William Annis at biostat sysreq for the following code.)
def get_effective_group():
    eguid = os.getegid()
    return grp.getgrgid(eguid).gr_name

if get_effective_group() != 'atsnp':
    msg = "Configure permissions by running this command: './setup_shell.sh'" +\
          "\n(Your user is supposed to have the primary group 'atsnp' active.)"
    print_with_color(msg)
    sys.exit(1)


def get_file_list(path):
   file_pattern = shared_pipe.FILE_PATTERN
   fileCount = 0