import math
import multiprocessing

from pyspark.sql import SparkSession

import Alignment
import ReadFile


def mP(a, tab, Aligner, sc):
    # Shared, process-safe list the worker processes append their alignments to.
    manager = multiprocessing.Manager()
    alignments = manager.list()

    cores = int(input('Enter the number of processors: '))
    if cores > multiprocessing.cpu_count():
        cores = multiprocessing.cpu_count()
        print("Maximum number of processors exceeded,", str(cores), "in use")
    else:
        print(str(cores), "processors in use")

    processes = []
    data = ReadFile.SPARKreadFile(sc)
    seqs = [x["SEQ"] for x in data.rdd.collect()]  # renamed from `dict` to avoid shadowing the builtin
    # seqs = ReadFile.HengLireadFile()  # Heng Li

    # Split the sequences into one roughly equal slice per core.
    chunk_size = len(seqs) / cores
    slices = Chunks(seqs, math.ceil(chunk_size))

    for i, s in enumerate(slices):
        procname = 'processor' + str(i)
        p = multiprocessing.Process(target=Alignment.mPalignment,
                                    args=(a, tab, Aligner, s, alignments, procname))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # The original referenced an undefined `spark`; reuse the active session.
    spark = SparkSession.builder.getOrCreate()
    DF = spark.createDataFrame(list(alignments))  # materialise the manager proxy as a plain list
    # Spark resolves column names case-insensitively by default, so 'seq'
    # matches the source DataFrame's 'SEQ' column.
    DataFrame = DF.join(data, on=['seq'], how='inner')
    return DataFrame
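# `Chunks` is called above but not defined in this excerpt. A minimal sketch,
# assuming it is the usual fixed-size slicer (name and signature taken from
# the call site; the real helper may differ):
def Chunks(lst, n):
    """Yield consecutive slices of at most n items from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]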
from collections import namedtuple
from datetime import datetime
from timeit import default_timer as timer
import os
import pickle

import mappy as mp  # required for mp.Aligner below; was missing from the imports
from pyspark import SparkContext, SparkConf
from pyspark.shell import sqlContext

import Alignment
import Aligner
import createBam
import HashTable
import MultiProcess
import ReadFile
import SparkAligner

sc = SparkContext.getOrCreate()
data = ReadFile.SPARKreadFile(sc)
seqs = [x["SEQ"] for x in data.rdd.collect()]  # renamed from `dict` to avoid shadowing the builtin

# MAPPY CODE ===================================================================================================================
a = mp.Aligner("reference.fa", preset="map-ont")  # load/build the minimap2 index
alignmentsS = []
# alignmentsH = []
tab = str.maketrans('ACTG', 'TGAC')  # translation table for reverse-complementing reads

# Record layout for alignments produced by the SPARK path.
AlignerS = namedtuple('SEQ', ['contig', 'flag', 'seq', 'pos', 'mapq', 'cigar',
                              'is_primary', 'MDtag', 'cstag'])  # SPARK
# AlignerH = namedtuple('SEQ', ['contig', 'Rname', 'flag', 'pos', 'mapq', 'cigar',
#                               'seq', 'is_primary', 'MDtag', 'cstag', 'basequal'])  # Heng Li

# startMP = timer()
# DataFrameMP = MultiProcess.mP(a, tab, AlignerS, sc)  # SPARK MULTIPROCESSING
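# Hypothetical sketch of the worker each process runs (the real
# Alignment.mPalignment lives in Alignment.py and is not shown in this excerpt).
# It assumes mappy's Aligner.map() API and the AlignerS field layout above;
# details such as flag handling are illustrative guesses, not the project's code.
def mPalignment_sketch(a, tab, Aligner, seqs, alignments, procname):
    for seq in seqs:
        for hit in a.map(seq, MD=True, cs=True):   # mappy yields one hit per mapping
            if hit.strand == -1:
                flag = 16                          # SAM flag: reverse strand
                out = seq.translate(tab)[::-1]     # reverse-complement via tab
            else:
                flag = 0
                out = seq
            # Append a namedtuple to the shared manager list; mP() later turns
            # the collected tuples into a Spark DataFrame. r_st is 0-based in
            # mappy, so +1 converts it to a SAM-style position.
            alignments.append(Aligner(hit.ctg, flag, out, hit.r_st + 1,
                                      hit.mapq, hit.cigar_str,
                                      hit.is_primary, hit.MD, hit.cs))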