def allCountsForOneFileAsString(allCounts):
    shared_pipe.init()
    build_line = ""
    counts = ['rdata', 'counted']
    counts.extend(shared_pipe.NAMED_CUTOFFS.keys())
    for one_count in counts:
        build_line += " " + one_count + " :" + str(allCounts[one_count])
    return build_line

def getEmptyCounts():
    shared_pipe.init()
    #The status 'es_skipped' should ALWAYS indicate that a record with that
    #ID was already present in the Elasticsearch index.
    #'cutoff_total' is the number of records that meet the p-value cutoff;
    #if p-value cutoffs are disabled, this number will be 0.
    counts = ['COMPLETE', 'IN_PROGRESS', 'NOT_STARTED', 'rdata',
              'cutoff_total', 'es_added', 'es_skipped', 'other']
    return dict.fromkeys(counts, 0)

def readOneProgressFile(progFilePath):
    shared_pipe.init()
    summary = getEmptyCounts()
    progFile = open(progFilePath, 'r')
    fileLines = []
    for line in progFile:
        fileLines.append(line)
        terms = line.split()
        summary.update(addCountsFromOneLine(terms, summary))
    progFile.close()
    return { "summary" : summary, "fileLines" : fileLines }

def checkAllDirs(parent):
    shared_pipe.init()
    #Number of job dirs per condor submit file.
    submitSize = (shared_pipe.SETTINGS['chunk_count'] /
                  shared_pipe.SETTINGS['n_submit_files'])
    completedDirs = 0
    overallProgressFile = open('progress.txt', 'a+')
    #Should be regenerated once per status-check run, regardless of which
    #data sets are being checked.
    overallSummary = getEmptyCounts()
    k = 0  #k counts directories within the current batch.
    completeDirsForBatch = 0
    whichBatch = 0
    batches = setup_list_of_submit_files(shared_pipe.SETTINGS['n_submit_files'])
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        dirName = '/'.join([parent, 'chunk' + str(i).zfill(2)])
        dirResults = checkOneDirectory(dirName)
        overallSummary = update_overall_counts(overallSummary,
                                               dirResults['summary'])
        #writeToFileAndPrint(dirResults['fileLines'], overallProgressFile)
        #overallProgressFile.writelines(dirResults['fileLines'])
        k += 1  #This gets reset once per batch.
        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1
            completeDirsForBatch += 1
        #End of batch: turn over all per-batch calculations.
        if k == submitSize:
            if completeDirsForBatch == submitSize:
                batches[whichBatch] = True
            k = 0
            completeDirsForBatch = 0
            whichBatch += 1
    how_many_complete_batches = show_which_batches_are_done(batches)
    writeToFileAndPrint("\tcompleted " + str(completedDirs) + " out of " +
                        str(shared_pipe.SETTINGS['chunk_count']) +
                        " directories.", overallProgressFile)
    writeToFileAndPrint("\tcompleted " + str(how_many_complete_batches) +
                        " out of " +
                        str(shared_pipe.SETTINGS['n_submit_files']) +
                        " submit files.", overallProgressFile)
    #Pretty sure that we should use the submit file called 'completedChunks' next.
    writeToFileAndPrint(' summary for all active jobs:', overallProgressFile)
    writeToFileAndPrint(str(overallSummary), overallProgressFile)
    overallProgressFile.close()

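#The helpers used above (writeToFileAndPrint, setup_list_of_submit_files,
#show_which_batches_are_done) are defined elsewhere. Minimal sketches of
#plausible implementations, assuming a batch is tracked as one boolean
#"done" flag per condor submit file (these are assumptions, not the
#originals):

def writeToFileAndPrint(text, out_file):
    #Echo a status line to stdout and append it to the overall progress file.
    print text
    out_file.write(text + "\n")

def setup_list_of_submit_files(n_submit_files):
    #One False flag per submit file; flipped to True once every directory
    #in that batch is complete.
    return [False] * n_submit_files

def show_which_batches_are_done(batches):
    #Report each batch's status and return how many are finished.
    for i, done in enumerate(batches):
        print "batch " + str(i) + (" is done" if done else " is not done")
    return sum(1 for done in batches if done)
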
def setupJobDirs(input_path):
    shared_pipe.init()
    filesToCopy = ['multi_pipeline.py', 'rdata2sqlite.R',
                   'count_records_matching_cutoffs.py', 'shared_pipe.py']
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        jobDir = 'chunk' + str(i).zfill(2)
        if not os.path.exists(jobDir):
            os.makedirs(jobDir)
        else:
            print "Directory : " + jobDir + " already exists."
            print "delete all chunk* directories to use this script."
            exit(1)
        for oneFile in filesToCopy:
            shutil.copyfile(oneFile, jobDir + "/" + oneFile)
    fList = get_file_list(input_path)
    chunk_size = float(len(fList)) / \
                 float(shared_pipe.SETTINGS['chunk_count'])
    chunk_size = int(math.ceil(chunk_size))
    print "chunk count = " + str(shared_pipe.SETTINGS['chunk_count'])
    print "this many files in the list: " + str(len(fList))
    print "chunk size: " + str(chunk_size)
    progressFile = None
    chunk = 0
    #Progress files should cover chunk00 to chunk(N-1), not chunk01 to chunkN,
    #so that they match the directories created above.
    for index, oneFile in enumerate(fList):
        #print "index = " + str(index)
        if index % chunk_size == 0:
            if progressFile is not None:
                progressFile.close()
            progPath = 'chunk' + str(chunk).zfill(2) + '/progress.txt'
            print "setting up a new progress file: " + progPath
            progressFile = open(progPath, 'a+')
            chunk += 1
        oneLine = " ".join([oneFile,
                            str(shared_pipe.PROGRESS_STATES['NOT_STARTED']),
                            "\n"])
        progressFile.write(oneLine)
    if progressFile is not None:
        progressFile.close()

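#For orientation, a sketch of the on-disk layout setupJobDirs produces,
#assuming chunk_count = 3 (the file names below are made up):
#
#  chunk00/
#      multi_pipeline.py   rdata2sqlite.R
#      count_records_matching_cutoffs.py   shared_pipe.py   progress.txt
#  chunk01/  (same contents)
#  chunk02/  (same contents)
#
#Each progress.txt holds one line per input file assigned to that chunk, in
#the form "<file path> <numeric progress state>", e.g.
#"/data/in/file0001.rdata 0" if NOT_STARTED maps to 0 (an assumed value).
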
def readOneProgressFile(progFilePath):
    shared_pipe.init()
    summary = { 'COMPLETE' : 0, 'IN_PROGRESS' : 0, 'NOT_STARTED' : 0 }
    progFile = open(progFilePath, 'r')
    fileLines = []
    for line in progFile:
        fileLines.append(line)
        terms = line.split()
        if int(terms[1]) == shared_pipe.PROGRESS_STATES['COMPLETE']:
            summary['COMPLETE'] += 1
        elif int(terms[1]) == shared_pipe.PROGRESS_STATES['IN_PROGRESS']:
            summary['IN_PROGRESS'] += 1
        elif int(terms[1]) == shared_pipe.PROGRESS_STATES['NOT_STARTED']:
            summary['NOT_STARTED'] += 1
    progFile.close()
    return { "summary" : summary, "fileLines" : fileLines }

def addCountsFromOneLine(terms, summary):
    shared_pipe.init()
    #Can this be factored out to one call at the top of the file?
    #Invert PROGRESS_STATES so a state name can be looked up by its numeric code.
    pgs = dict(zip(shared_pipe.PROGRESS_STATES.values(),
                   shared_pipe.PROGRESS_STATES.keys()))
    status = pgs[int(terms[1])]
    summary[status] += 1
    if status == 'COMPLETE':
        specific_counts = terms[2:]
        #print "specific counts: " + repr(specific_counts)
        i = 0
        while i < len(specific_counts):
            value_called = specific_counts[i]
            #print "value called " + value_called
            value = int(specific_counts[i + 1].replace(':', ''))
            #print "value : " + str(value)
            summary[value_called] += value
            i += 2
    return summary

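#A worked example of the line format addCountsFromOneLine expects. For a
#COMPLETE record, the tail of the line is the string built by
#allCountsForOneFileAsString, i.e. alternating "name :value" tokens.
#Assuming PROGRESS_STATES['COMPLETE'] == 2 (an illustrative value only):
#
#  line  = "file0001.rdata 2 rdata :10 counted :10"
#  terms = line.split()
#        -> ['file0001.rdata', '2', 'rdata', ':10', 'counted', ':10']
#
#terms[2:] is then walked two tokens at a time: 'rdata' names the counter,
#and ':10'.replace(':', '') yields the integer added to summary['rdata'].
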
def setupJobDirs(input_path):
    shared_pipe.init()
    filesToCopy = ['snp_info_pipeline.py', 'process_snpinfo.R',
                   'put_snp_info_in_es.py', 'shared_pipe.py']
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        print "setting up directory for " + str(i)
        jobDir = 'chunk' + str(i).zfill(2)
        if not os.path.exists(jobDir):
            os.makedirs(jobDir)
        else:
            print "Directory : " + jobDir + " already exists."
            print "delete all chunk* directories to use this script."
            exit(1)
        for oneFile in filesToCopy:
            shutil.copyfile(oneFile, jobDir + "/" + oneFile)
    fList = get_file_list(input_path)
    print "fList :" + repr(fList)
    chunk_size = len(fList) / \
                 (shared_pipe.SETTINGS['chunk_count'] - 1)
    progressFile = None
    chunk = 0
    #Progress files should cover chunk00 to chunk(N-1) to match the
    #directories created above.
    for index, oneFile in enumerate(fList):
        if index % chunk_size == 0:
            if progressFile is not None:
                progressFile.close()
            progPath = '/'.join(['chunk' + str(chunk).zfill(2),
                                 'progress.txt'])
            progressFile = open(progPath, 'a+')
            chunk += 1
        oneLine = " ".join([oneFile,
                            str(shared_pipe.PROGRESS_STATES['NOT_STARTED']),
                            "\n"])
        progressFile.write(oneLine)
    if progressFile is not None:
        progressFile.close()  #Close the last progress file written.
    print "this many files in the list: " + str(len(fList))
    print "chunk size: " + str(chunk_size)

def checkAllDirs():
    shared_pipe.init()
    #Number of job dirs per condor submit file.
    submitSize = (shared_pipe.SETTINGS['chunk_count'] /
                  shared_pipe.SETTINGS['n_submit_files'])
    completedDirs = 0
    if os.path.isfile('progress.txt'):
        print "Removing existing overall progress file to rebuild it."
        os.remove('progress.txt')
    overallProgressFile = open('progress.txt', 'a+')
    overallSummary = getEmptyCounts()
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        dirName = 'chunk' + str(i).zfill(2)
        dirResults = checkOneDirectory(dirName)
        #Does the following line effectively replace the block of code that follows?
        #print "right before the call to update_overall_counts" + str(overallSummary)
        overallSummary = update_overall_counts(overallSummary,
                                               dirResults['summary'])
        print "summary count : " + str(dirResults['summary']['counted'])
        #This is where all of the special stats get added.
        overallProgressFile.writelines(dirResults['fileLines'])
        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1
    completedChunks = completedDirs / submitSize
    print 'completed counting ' + str(completedDirs) + " out of " + \
        str(shared_pipe.SETTINGS['chunk_count']) + " directories"
    print "completed counting " + str(completedChunks) + " out of " + \
        str(shared_pipe.SETTINGS['n_submit_files']) + " submit files."
    #Pretty sure that we should use the submit file called 'completedChunks' next.
    overallProgressFile.write('summary for all active jobs:')
    overallProgressFile.write(str(overallSummary))
    overallProgressFile.close()

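#update_overall_counts and checkForCompleteDir are called above but defined
#elsewhere. Minimal sketches under two assumptions: the per-directory summary
#uses the same keys as the overall one, and a directory counts as complete
#when nothing in it is IN_PROGRESS or NOT_STARTED (assumptions, not the
#originals):

def update_overall_counts(overall, one_dir_summary):
    #Fold each per-directory count into the running totals.
    for key, value in one_dir_summary.items():
        overall[key] = overall.get(key, 0) + value
    return overall

def checkForCompleteDir(summary):
    #A directory is done when no file is waiting or still running.
    return summary['IN_PROGRESS'] == 0 and summary['NOT_STARTED'] == 0
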
def checkAllDirs():
    shared_pipe.init()
    #Number of job dirs per condor submit file.
    submitSize = (shared_pipe.SETTINGS['chunk_count'] /
                  shared_pipe.SETTINGS['n_submit_files'])
    completedDirs = 0
    if os.path.isfile('progress.txt'):
        print "Removing existing overall progress file to rebuild it."
        os.remove('progress.txt')
    overallProgressFile = open('progress.txt', 'a+')
    overallSummary = {'COMPLETE': 0, 'IN_PROGRESS': 0, 'NOT_STARTED': 0}
    for i in range(0, shared_pipe.SETTINGS['chunk_count']):
        dirName = 'chunk' + str(i).zfill(2)
        dirResults = checkOneDirectory(dirName)
        overallSummary['COMPLETE'] += dirResults['summary']['COMPLETE']
        overallSummary['IN_PROGRESS'] += dirResults['summary']['IN_PROGRESS']
        overallSummary['NOT_STARTED'] += dirResults['summary']['NOT_STARTED']
        overallProgressFile.writelines(dirResults['fileLines'])
        if checkForCompleteDir(dirResults['summary']):
            completedDirs += 1
    completedChunks = completedDirs / submitSize
    print 'completed ' + str(completedDirs) + " out of " + \
        str(shared_pipe.SETTINGS['chunk_count']) + " directories"
    print "completed " + str(completedChunks) + " out of " + \
        str(shared_pipe.SETTINGS['n_submit_files']) + " submit files."
    #Pretty sure that we should use the submit file called 'completedChunks' next.
    overallProgressFile.write('summary for all active jobs:')
    overallProgressFile.write(str(overallSummary))
    overallProgressFile.close()

import sqlite3
from elasticsearch import Elasticsearch, helpers, ConnectionError
import elasticsearch
import time
import json
import sys
import requests
import os
import zlib
import math
import pickle
import shared_pipe

shared_pipe.init()
pStates = shared_pipe.PROGRESS_STATES
cutoffs = shared_pipe.PVALUE_CUTOFFS
use_cutoffs = shared_pipe.RESTRICT_BY_PVALUE

#SHOULD NOT CHANGE:
sqlite_table_name = 'scores_data'

"""
A script to import sqlite3 tables into Elasticsearch.

To use: give it one argument, the name of the directory containing the
input files, with no trailing space. Then (in the shared_pipe settings
file) change DRY_RUN to False.
"""
DRY_RUN = shared_pipe.DRY_RUN  #False

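#A minimal sketch of the import loop this script performs, assuming a target
#index named 'atsnp'; the index name and both function names below are
#illustrative, not the script's real logic:

def rows_as_actions(sqlite_path):
    #Stream every row of the scores table as an Elasticsearch bulk action.
    conn = sqlite3.connect(sqlite_path)
    cursor = conn.execute("SELECT * FROM " + sqlite_table_name)
    columns = [d[0] for d in cursor.description]
    for row in cursor:
        #One bulk action per row; the row becomes the document body.
        yield {'_index': 'atsnp', '_source': dict(zip(columns, row))}
    conn.close()

def import_one_file(sqlite_path):
    es = Elasticsearch()
    if DRY_RUN:
        print "dry run: skipping " + sqlite_path
        return
    helpers.bulk(es, rows_as_actions(sqlite_path))
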
def setup_initial_counts():
    shared_pipe.init()
    empty_dict = dict.fromkeys(shared_pipe.NAMED_CUTOFFS.keys(), 0)
    empty_dict.update({'counted': 0})
    return empty_dict

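#For illustration: if shared_pipe.NAMED_CUTOFFS were, say,
#{'p_lt_1e-4': 1e-4, 'p_lt_1e-5': 1e-5} (hypothetical contents), then
#setup_initial_counts() would return
#{'p_lt_1e-4': 0, 'p_lt_1e-5': 0, 'counted': 0}.
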
import os, grp, math
import shutil
import os.path
import re
import shared_pipe
from shared_pipe import WhichTFLibs, print_with_color, print_error
import sys
from subprocess import call
import subprocess

# What does this file do?
# It breaks the whole collection of files up into N directories,
# creating a separate condor submit file for each batch of rangeSize
# directories.

shared_pipe.init()

#The following code ensures that directories created by this script
#can be deleted by other users in the group 'atsnp'.
#(Thanks to William Annis at biostat sysreq for the following code.)
def get_effective_group():
    eguid = os.getegid()
    return grp.getgrgid(eguid).gr_name

if get_effective_group() != 'atsnp':
    msg = "Configure permissions by running this command: './setup_shell.sh'" + \
          "\n(Your user is supposed to have the primary group 'atsnp' active.)"
    print_with_color(msg)
    sys.exit(1)

def get_file_list(path):
    file_pattern = shared_pipe.FILE_PATTERN
    fileCount = 0
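#get_file_list is only partially shown above. A minimal sketch of how such a
#helper might be completed, assuming shared_pipe.FILE_PATTERN is a regular
#expression matched against file names in the input directory (the name
#get_file_list_sketch and this logic are assumptions, not the original):

def get_file_list_sketch(path):
    file_pattern = shared_pipe.FILE_PATTERN
    matched = []
    for name in sorted(os.listdir(path)):
        if re.match(file_pattern, name):
            #Keep full paths so later stages can open the files directly.
            matched.append(os.path.join(path, name))
    print "found " + str(len(matched)) + " files matching " + str(file_pattern)
    return matched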