Exemplo n.º 1
0
 def _method_GATB(self, *args, **kwargs):
     """Write every record of the input bank to the output file as FASTA.

     Iterates over ``Bank(self.infile)`` and, for each record, writes a
     two-line entry to ``self.outfile``: a ``>`` header line holding the
     record's comment, followed by a line holding its sequence.  Both
     fields are decoded from bytes as UTF-8 before being written.

     The extra positional and keyword arguments are accepted but are not
     used by the conversion itself.
     """
     with open(self.outfile, "w") as handle:
         for entry in Bank(self.infile):
             # Decode the header first, then the sequence letters.
             header = entry.comment.decode("utf-8")
             residues = entry.sequence.decode("utf-8")
             handle.write(">{}\n{}\n".format(header, residues))
     print("test")
    indices = {}
    for index in parameters:
        if parameters[index]['from_end']:
            indices[index] = len(sequence) - parameters[index]['index']
        else: 
            indices[index] = parameters[index]['index']
    return(indices)

# Load per-read barcodes into an in-memory SQLite table.
# Each record of the FASTQ input contributes one row (seqid, celbc, umi):
# the read identifier plus the cell-barcode and UMI substrings sliced out of
# the read sequence at the positions returned by parse_indices().

# NOTE(review): this initial list is never read; `r` is rebound to a fresh
# list at the start of every chunk below and set to None after the loop.
r = []
# NOTE(review): the message formats snakemake.input['fastqfile'] as a whole,
# while only its first element ([0]) is opened further down.
print("Reading source file into in-memory database of barcodes: {}".format(snakemake.input['fastqfile']))
# ":memory:" keeps the whole database in RAM; nothing is written to disk here.
con = sqlite3.connect(":memory:")
cur = con.cursor()
# Emits an empty line on stdout.
print("")
cur.execute('CREATE TABLE barcodes (seqid TEXT, celbc TEXT, umi TEXT)')
# reading from file and writing to database in chunks to save memory
fastq_parser = Bank(snakemake.input.fastqfile[0])
# `chunks` is defined outside this excerpt; presumably it yields batches of
# at most 10000 records each -- verify against its definition.
for chunk in chunks(fastq_parser, 10000):
    # Rows collected for this chunk only; inserted in one executemany() call.
    r = []
    for seq in chunk:
        sequence = seq.sequence.decode("utf-8")
        # `indices` is used as a mapping from boundary name to a position in
        # `sequence`; the layout comes from config['params']['barcoding'].
        indices = parse_indices(sequence, config['params']['barcoding'])
        umi = sequence[indices["umi_start"]:indices["umi_end"]]
        cel = sequence[indices["cell_bc_start"]:indices["cell_bc_end"]]
        # Read id = header text up to (not including) the first space.
        seqid = seq.comment.decode("utf-8").split(" ")[0]
        r.append((seqid, cel, umi))
    cur.executemany('INSERT INTO barcodes VALUES (?,?,?)', r)
# Drop the reference to the last batch of rows.
r = None
# NOTE(review): the message below misspells "identifiers"; left untouched
# because it is runtime output.
print("Creating index on read indentifiers")
# The index is created after the bulk load. Being UNIQUE, its creation fails
# if any seqid was inserted more than once.
cur.execute('CREATE UNIQUE INDEX seqidx ON barcodes (seqid)')

print("Writing output file: {}".format(snakemake.output))
Exemplo n.º 3
0
    type=str,
    help='<start>:<end> indices of reads to process. For debugging/batching.',
    default=None)

# Parse the command line; `parser` and its argument definitions are set up
# before this excerpt.
args = parser.parse_args()

# Local aliases for the file-path arguments.
graph_file = args.graph_file
reads_file = args.reads_file
out_file = args.out_file

# Optional window of reads to process, given as "<begin>:<end>".
# It stays None when the option was not supplied; otherwise it becomes an
# (int, int) tuple. The unpacking raises ValueError unless the value contains
# exactly one ':', and int() raises ValueError for non-integer parts.
process_interval = None
if args.reads_to_process is not None:
    begin, end = args.reads_to_process.split(':')
    process_interval = int(begin), int(end)

# Open the reads file as a GATB bank and report its URI and detected type.
bank = Bank(reads_file)
print("File '%s' is of type: %s" % (bank.uri, bank.type))

# Graph takes an option string; '-in' presumably names the graph file to
# load -- confirm against the pyGATB documentation.
graph = Graph('-in %s' % graph_file)
KMER_SIZE = graph.kmerSize
# 0.8 raised to the k-mer size.
# NOTE(review): the rationale for the 0.8 base is not stated here; presumably
# a per-base match probability -- confirm with the author.
expected_seed_ratio = 0.8**KMER_SIZE

nseqs = 0

# Leftover commented-out pickle experiments, kept as found.
# pickle.dump(minhash, open('minhash', 'wb'))
# bktree = pickle.load(open('BKTREE_19', 'rb'))
print("Got the nodes")

# Running totals initialised to zero; they are presumably updated by the
# read-processing code that follows this excerpt.
total_path_found = 0
total_read_length = 0
Exemplo n.º 4
0
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ===========================================================================

# we import pyGATB Bank
from gatb import Bank

# Path to a FASTA file holding a few example sequences. The path is relative
# ('../...'), so it presumably has to be resolved from the directory this
# example is run in -- verify how Bank resolves relative paths.
F_NAME = '../thirdparty/gatb-core/gatb-core/test/db/query.fa'

# Create the Bank object representing that sequence file.
bank = Bank(F_NAME)

# Report the bank's URI and its type as exposed by the Bank object.
print("File '%s' is of type: %s" % (bank.uri, bank.type))

# Counter initialised to zero; presumably incremented while iterating over
# the bank below.
nseqs = 0

# We iterate over some sequences.
for i, seq in enumerate(bank):
    # 'seq' is of type 'Sequence'.
    # Accessing 'Sequence' internals is done as follows:
    #   sequence header : seq.comment
    #   sequence quality: seq.quality (Fastq only)
    #   sequence letters: seq.sequence
    #   sequence size   : len(seq)
    seqid = seq.comment.decode("utf-8").split(" ")[0]
    if i < 5: