Example #1
def main():
    parser = argparse.ArgumentParser(
        description='extract immune receptor sequences from Genbank records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # directory to store the batches in
    parser.add_argument('batch_dirname',
                        metavar='dir',
                        help='name for the batch directory')
    # input files
    parser.add_argument('genbank_filename',
                        metavar='genbank-file',
                        help='the file with the Genbank records')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    # check if the batch directory already exists
    if os.path.exists(args.batch_dirname):
        logging.error('batch directory %s already exists', args.batch_dirname)
        return 10
    else:
        os.mkdir(args.batch_dirname)

    data = {}
    record_counts = 0

    # read in the Genbank records
    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        for record in SeqIO.parse(genbank_handle, 'genbank'):
            record_counts += 1

            # store the data by year and key
            key, year = calculate_key_year(record)
            if year not in data:
                data[year] = {}
            if key not in data[year]:
                data[year][key] = []
            data[year][key].append(record)

    # make a directory for each year
    for year in data:
        os.mkdir('%s/%04d' % (args.batch_dirname, year))

        # for each key, make a file with the Genbank records
        for key in data[year]:
            if type(key) is int:
                filename = '%s/%04d/pmid_%d.genbank' % (args.batch_dirname,
                                                        year, key)
            else:
                filename = '%s/%04d/hash_%d.genbank' % (args.batch_dirname,
                                                        year, abs(hash(key)))
            with open(filename, 'wt') as handle:
                SeqIO.write(data[year][key], handle, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
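
Nearly every example below calls an open_compressed helper that is not shown here. A minimal sketch of what such a helper might look like, assuming it dispatches on the file extension and treats '-' as standard input:

import bz2
import gzip
import sys


def open_compressed(filename, mode='rt'):
    # hypothetical helper, not part of the examples: choose an opener based
    # on the file extension and treat '-' as standard input
    # (a production version would avoid closing sys.stdin on context exit)
    if filename == '-':
        return sys.stdin.buffer if 'b' in mode else sys.stdin
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if filename.endswith('.bz2'):
        return bz2.open(filename, mode)
    return open(filename, mode)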
Example #2
def deduce_schema(filename, examine_records=1000):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)

        # read a batch of records, size examine_records, to figure out what columns to make
        parses_ids = set()
        lineages_ids = set()
        for record in itertools.islice(reader, examine_records):
            if 'parses' in record:
                parses_ids.update(record['parses'].keys())
            if 'lineages' in record:
                lineages_ids.update(record['lineages'].keys())

        schema = pa.schema([
            pa.field('subject', pa.string(), nullable=False),
            pa.field('sample', pa.string(), nullable=True),
            pa.field('source', pa.string(), nullable=False),
            pa.field('name', pa.string(), nullable=False),
            pa.field('sequence', pa.binary(), nullable=False)
        ] + [
            pa.field('parse_' + p, pa.binary(), nullable=True)
            for p in sorted(parses_ids)
        ] + [
            pa.field('lineage_' + p, pa.string(), nullable=True)
            for p in sorted(lineages_ids)
        ])

        return schema, parses_ids, lineages_ids
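
deduce_schema relies on fastavro, itertools, and pyarrow imported as pa. A hedged usage sketch, with a hypothetical filename:

# examine the first 1000 records of a compressed Avro file and report the
# Arrow schema deduced from them
schema, parses_ids, lineages_ids = deduce_schema('seq_records.avro.bz2')
print(schema)
print('parse labels:', sorted(parses_ids))
print('lineage labels:', sorted(lineages_ids))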
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='add the given subject to an Avro file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename',
                        metavar='seq_record.avro',
                        help='the Avro file with the sequence records')
    parser.add_argument('subject', metavar='S', help='the subject to set')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    with open_compressed(args.seq_record_filename, 'rb') as seq_record_handle:
        seq_record_reader = fastavro.reader(seq_record_handle)

        fastavro.writer(sys.stdout.buffer,
                        seq_record_reader.writer_schema,
                        subject_adder(seq_record_reader, args.subject),
                        codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
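
subject_adder is not shown in this example. A plausible sketch, assuming it simply overwrites the subject field on each record as it streams from reader to writer:

def subject_adder(records, subject):
    # assumed helper: set the subject on every record and pass it through
    for record in records:
        record['subject'] = subject
        yield record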
Example #4
def main():
    parser = argparse.ArgumentParser(description='subset Genbank files based on organism and length',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # files
    parser.add_argument('genbank_filenames', metavar='genbank_file', nargs='+', help='the file with the Genbank records')
    parser.add_argument('--organism', '-o', metavar='O', type=str, default='Homo sapiens', help='only process records with the given organism')
    parser.add_argument('--max-size', '-m', metavar='M', type=int, default=50000, help='ignore sequences longer than this')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    processed_record_count = 0

    for genbank_filename in args.genbank_filenames:
        logging.info('processing %s', genbank_filename)
        with open_compressed(genbank_filename, 'rt') as genbank_file:
            records = SeqIO.parse(genbank_file, 'genbank')
            filtered_records = genbank_filter(records,
                                              organism=args.organism,
                                              max_size=args.max_size)
            SeqIO.write(filtered_records, sys.stdout, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s', time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(elapsed_time)))
Example #5
def main():
    parser = argparse.ArgumentParser(description='convert sequence records in Avro format to FASTA or FASTQ')
    # input files
    parser.add_argument('seq_record_filenames', metavar='seq_record.avro', nargs='+', help='the Avro file with the sequence records')
    # options
    output_format = parser.add_mutually_exclusive_group()
    output_format.add_argument('--fasta', '-a', default=True, action='store_true', help='output a FASTA file')
    output_format.add_argument('--fastq', '-q', action='store_false', dest='fasta', help='output a FASTQ file')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    for input_filename in args.seq_record_filenames:
        with open_compressed(input_filename, 'rb') as seq_record_handle:
            seq_record_reader = fastavro.reader(seq_record_handle)

            for record in seq_record_reader:
                if args.fasta:
                    print('>%s\n%s' % (record['name'], record['sequence']['sequence']))
                else:
                    print('@%s\n%s\n+\n%s' % (record['name'], record['sequence']['sequence'], record['sequence']['qual']))

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s', time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(elapsed_time)))
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('parse_label',
                        metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('filenames',
                        metavar='file',
                        nargs='+',
                        help='the Avro file to read')
    args = parser.parse_args()

    print('accession', 'description', 'v_name', 'd_name', 'j_name', sep='\t')

    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)

            for record in reader:
                name = record['name']
                assert name.startswith('genbank:')
                accession = name.split(':')[1]

                parse = record['parses'][args.parse_label]
                v_name, _, d_name, _, j_name, _ = best_vdj_score(parse)
                description = None
                if 'description' in record['sequence']['annotations']:
                    description = record['sequence']['annotations'][
                        'description']

                print(accession, description, v_name, d_name, j_name, sep='\t')
Example #7
def igblast_chain(igblast_filenames):
    for filename in igblast_filenames:
        with open_compressed(filename, 'rt') as igblast_handle:
            logging.info('processing parses in %s', filename)
            igblast_parse_reader = IgBLASTParser(igblast_handle)
            for record in igblast_parse_reader:
                yield record
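
A hedged usage sketch for igblast_chain; the filenames are hypothetical and IgBLASTParser comes from the surrounding codebase:

for parse in igblast_chain(['batch000.igblast.gz', 'batch001.igblast.gz']):
    # query_name is the attribute used on these parse records elsewhere in
    # these examples
    print(parse.query_name)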
Example #8
def main():
    parser = argparse.ArgumentParser(
        description=
        'extract the read names and alignment scores from a SAM file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('sam_filenames',
                        metavar='file.sam',
                        nargs='*',
                        default=['-'],
                        help='the SAM files to process')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    record_template = {'pair_id': None, 'align_score': None}
    writer = csv.DictWriter(sys.stdout, fieldnames=record_template.keys())
    writer.writeheader()

    for sam_filename in args.sam_filenames:
        with open_compressed(sam_filename, 'rt') as sam_file_handle:
            for ident, score in basic_sam_parser_match(sam_file_handle):
                record = record_template.copy()
                record['pair_id'] = ident
                record['align_score'] = score
                writer.writerow(record)

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #9
def main():
    parser = argparse.ArgumentParser(
        description=
        'get the counts of subjects and sources from sequence record objects',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename',
                        metavar='seq_record.avro',
                        nargs='*',
                        help='the Avro file with the sequence records')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    subject_source_counts = Counter()

    if len(args.seq_record_filename) == 0:
        seq_record_filenames = ['-']
    else:
        seq_record_filenames = args.seq_record_filename

    for record_filename in seq_record_filenames:
        logging.info('processing file %s', record_filename)
        with open_compressed(record_filename, 'rb') as seq_record_handle:
            for record in fastavro.reader(seq_record_handle):
                subject = record['subject']
                source = record['source']
                subject_source_counts[(subject, source)] += 1

    for subject, source in subject_source_counts:
        print(subject,
              source,
              subject_source_counts[(subject, source)],
              sep='\t')
Example #10
def main():
    parser = argparse.ArgumentParser(description='generate barcode and primer information for FASTQ read pairs',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # what to put into the source field
    parser.add_argument('source', metavar='source', help='what to put into the source field')
    # the barcode map
    parser.add_argument('barcode_map_filename', metavar='barcode_map.csv', help='CSV file with the barcode map')
    # file with the barcodes and targets
    parser.add_argument('barcodes_targets_filenames', metavar='idents.csv', nargs='+', help='CSV file with the barcodes and targets')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    logging.info('loading barcode map')
    barcode_map = {}
    with open(args.barcode_map_filename, 'r') as map_handle:
        for row in csv.DictReader(map_handle):
            assert args.source == row['run_label']
            key = (row['barcode1'], row['target1'], row['barcode2'], row['target2'])
            assert key not in barcode_map # make sure there are no duplicate rows
            barcode_map[key] = (row['participant_label'], row['replicate_label'])
    logging.info('loaded %d entries', len(barcode_map))

    for barcodes_targets_filename in args.barcodes_targets_filenames:
        logging.info('processing file %s', barcodes_targets_filename)
        with open_compressed(barcodes_targets_filename, 'rt') as barcode_targets_handle:
            for record in csv.DictReader(barcode_targets_handle):
                key = (record['barcode1:name'], record['target1:name'], record['barcode2:name'], record['target2:name'])
                if key not in barcode_map:  # report keys missing from the barcode map
                    print(*key, sep=',')
Example #11
def main():
    parser = argparse.ArgumentParser(
        description='validate sequence records from Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('repertoire_filenames',
                        metavar='repertoire-file',
                        nargs=3,
                        help='the V, D, and J repertoire FASTA files used in IgBLAST')
    parser.add_argument('sequence_record_filenames',
                        metavar='seq_record.avro',
                        nargs='+',
                        help='Avro files with the sequence records to test')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    v_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[0], 'fasta')
    }
    d_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[1], 'fasta')
    }
    j_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[2], 'fasta')
    }

    record_count = 0

    error = False
    for filename in args.sequence_record_filenames:
        logging.info('processing file %s', filename)
        if error:
            break
        with open_compressed(filename, 'rb') as input_handle:
            for record in fastavro.reader(input_handle):
                if not tests.test_parse_alignment_structure(record):
                    pprint(record)
                    error = True
                if not tests.test_parse_alignment_sequences(
                        record, v_repertoire, d_repertoire, j_repertoire):
                    pprint(record)
                    error = True

                if error:
                    break

                record_count += 1

    logging.info('processed %d sequence records', record_count)
    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #12
def avro_2nd_field_missable_iterator(filename, fieldname1, fieldname2):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)
        for record in reader:
            if fieldname2 in record[fieldname1]:
                yield record[fieldname1][fieldname2]
            else:
                yield None
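
A hedged usage sketch; the filename and field names are hypothetical:

# count the records that carry an 'igblast' parse, tolerating records where
# the parse is missing (the iterator yields None for those)
n_parsed = sum(
    1 for parse in avro_2nd_field_missable_iterator('recs.avro', 'parses', 'igblast')
    if parse is not None)
print(n_parsed)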
Example #13
def main():
    parser = argparse.ArgumentParser(description='generate barcode and primer information for FASTQ read pairs',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # what to put into the source field
    parser.add_argument('source', metavar='source', help='what to put into the source field')
    # the barcode map
    parser.add_argument('barcode_map_filename', metavar='barcode_map.csv', help='CSV file with the barcode map')
    # file with the barcodes and targets
    parser.add_argument('barcodes_targets_filename', metavar='idents.csv', help='CSV file with the barcodes and targets')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    logging.info('loading barcode map')
    barcode_map = {}
    with open(args.barcode_map_filename, 'r') as map_handle:
        for row in csv.DictReader(map_handle):
            assert args.source == row['run_label']
            key = (row['barcode1'], row['target1'], row['barcode2'], row['target2'])
            assert key not in barcode_map # make sure there are no duplicate rows
            barcode_map[key] = (row['participant_label'], row['replicate_label'])
    logging.info('loaded %d entries', len(barcode_map))

    demuxing_template = {'pair_id': None,
                         'source': args.source,
                         'subject': None,
                         'sample': None}
    writer = csv.DictWriter(sys.stdout, fieldnames=demuxing_template.keys())
    writer.writeheader()

    logging.info('annotating reads')
    read_pair_count = 0
    annotated_pair_count = 0
    with open_compressed(args.barcodes_targets_filename, 'rt') as barcode_targets_handle:
        for record in csv.DictReader(barcode_targets_handle):
            read_pair_count += 1

            demuxing = demuxing_template.copy()
            demuxing['pair_id'] = record['pair_id']

            # form the lookup
            key = (record['barcode1:name'], record['target1:name'], record['barcode2:name'], record['target2:name'])
            if key in barcode_map:  # if found, annotate with subject and sample
                annotated_pair_count += 1

                participant, sample = barcode_map[key]
                demuxing['subject'] = participant
                demuxing['sample'] = sample

            writer.writerow(demuxing)

    logging.info('processed %d read pairs', read_pair_count)
    logging.info('annotated %d (%f%%) read pairs', annotated_pair_count, 100.0*annotated_pair_count/read_pair_count)

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s', time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(elapsed_time)))
Example #14
def main():
    parser = argparse.ArgumentParser(
        description=
        'concatenate two FASTQ files, reverse-complementing the second one, with a given spacer in between',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input file
    parser.add_argument('fastq1_filename',
                        metavar='file1.fq',
                        help='the R1 FASTQ file')
    parser.add_argument(
        'fastq2_filename',
        metavar='file2.fq',
        help=
        'the R2 FASTQ file to reverse complement and concatenate to the above')
    # options
    parser.add_argument('--spacer',
                        '-s',
                        type=str,
                        default='XXXXXXXX',
                        help='the spacer sequence')
    parser.add_argument('--qual-spacer',
                        '-q',
                        type=int,
                        default=0,
                        help='the qual score to assign to the spacer')

    args = parser.parse_args()

    spacer_seq = args.spacer
    spacer_qual = chr(33 + args.qual_spacer) * len(spacer_seq)

    # read the FASTQ file
    with open_compressed(args.fastq1_filename, 'rt') as input1_handle, \
         open_compressed(args.fastq2_filename, 'rt') as input2_handle:
        for r1, r2 in zip(FastqGeneralIterator(input1_handle),
                          FastqGeneralIterator(input2_handle)):
            r1_id, r1_seq, r1_qual = r1
            r2_id, r2_seq, r2_qual = r2

            r1_seq += spacer_seq + r2_seq.translate(
                _dna_complement_table)[::-1]
            r1_qual += spacer_qual + r2_qual[::-1]

            print('@%s\n%s\n+\n%s' % (r1_id, r1_seq, r1_qual))
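
The _dna_complement_table used to reverse complement read 2 is defined elsewhere. An assumed minimal version covering the unambiguous bases plus N:

# assumed definition; the real table may also map the IUPAC ambiguity codes
_dna_complement_table = str.maketrans('ACGTNacgtn', 'TGCANtgcan')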
Example #15
def main():
    parser = argparse.ArgumentParser(
        description='get the list of subjects in a set of sequence records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename',
                        metavar='seq_record.avro',
                        nargs='*',
                        help='the Avro file with the sequence records')
    # options
    parser.add_argument('-s',
                        '--sort-counts',
                        action='store_true',
                        help='sort by per-subject counts')
    parser.add_argument('-c',
                        '--show-counts',
                        action='store_true',
                        help='show the per-subject counts')
    parser.add_argument('-n',
                        '--show-none',
                        action='store_true',
                        help='show the un-assigned records')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    subject_counts = Counter()

    if len(args.seq_record_filename) == 0:
        seq_record_filenames = ['-']
    else:
        seq_record_filenames = args.seq_record_filename

    for record_filename in seq_record_filenames:
        logging.info('processing file %s', record_filename)
        with open_compressed(record_filename, 'rb') as seq_record_handle:
            subject_counts.update(
                get_subjects(fastavro.reader(seq_record_handle)))

    if args.sort_counts:
        for s, c in subject_counts.most_common():
            if args.show_none or s is not None:
                if args.show_counts:
                    print(s, c, sep='\t')
                else:
                    print(s)
    else:
        for s in sorted(subject_counts.keys(), key=str):
            if args.show_none or s is not None:
                if args.show_counts:
                    c = subject_counts[s]
                    print(s, c, sep='\t')
                else:
                    print(s)
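
get_subjects is not shown here. A minimal sketch, assuming it simply yields the subject field of every record so the Counter can tally them:

def get_subjects(records):
    # assumed helper: one subject value per record, including None for
    # un-assigned records
    for record in records:
        yield record['subject']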
Example #16
def main():
    parser = argparse.ArgumentParser(
        description='generate a sheet for Genbank immune receptor annotation',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input file
    parser.add_argument('genbank_filename',
                        metavar='genbank-file',
                        help='the file with the Genbank records')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    record_counts = 0

    excel_filename = args.genbank_filename
    if '.' in excel_filename:
        excel_filename = excel_filename[:excel_filename.rindex('.')]
    excel_filename += '.xlsx'

    # read in the Genbank records
    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        # load all the records
        records = list(SeqIO.parse(genbank_handle, 'genbank'))

        # get a unique references list for all records
        references = get_master_references(records)

        # create the workbook
        workbook = xlsxwriter.Workbook(excel_filename)
        curation_worksheet = workbook.add_worksheet('Curation')
        records_worksheet = workbook.add_worksheet('Records')

        # write the references to the sheet
        current_row = write_references(workbook, curation_worksheet,
                                       references)

        current_row += 1

        # write curation annotation
        current_row = write_curation_row(workbook, curation_worksheet, records,
                                         current_row)

        #
        write_genbank_records(workbook, records_worksheet, records)

        workbook.close()

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #17
def main():
    parser = argparse.ArgumentParser(description='extract sequence records from Avro files with a given subject',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('source_record_filename', metavar='seq_record.avro', nargs='+', help='Avro files with the sequence records to extract')
    parser.add_argument('dest_record_filename', metavar='target.avro', help='the destination Avro file')
    parser.add_argument('subject_label', metavar='subject', help='the subject to extract, use none for un-assigned records')
    # append
    parser.add_argument('-a', '--append', action='store_true', help='append records to an existing Avro file')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    if os.path.exists(args.dest_record_filename):
        if args.append:
            logging.info('appending to existing sequence record file %s', args.dest_record_filename)
        else:
            logging.error('destination file already exists, use the --append/-a flag to add to it, exiting')
            return 10

    if args.subject_label == 'none':
        args.subject_label = None
    logging.info('extracting records for subject %s', args.subject_label)

    # read in the first file to get the schema
    with open_compressed(args.source_record_filename[0], 'rb') as seq_record_handle:
        reader = fastavro.reader(seq_record_handle)
        schema = reader.writer_schema

    # open and append to the destination file
    with open_compressed(args.dest_record_filename, 'a+b') as dest_record_handle:
        fastavro.writer(dest_record_handle, schema,
                avro_file_record_filter_iter(args.source_record_filename, args.subject_label), codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s', time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(elapsed_time)))
Example #18
def parse_chain(filenames, file_format, mode='rt'):
    for filename in filenames:
        with open_compressed(filename, mode) as handle:
            if file_format in [
                    'avro', 'seq_rec', 'seq_record', 'sequence_record'
            ]:
                seq_record_reader = fastavro.reader(handle)
                for record in seq_record_reader:
                    result = SeqRecord(Seq(record['sequence']['sequence']),
                                       id=record['name'],
                                       description='')
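                    # presumably the quality scores are written through
                    # dict.__setitem__ to bypass the length validation that
                    # assigning to SeqRecord.letter_annotations would trigger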
                    dict.__setitem__(result._per_letter_annotations,
                                     'phred_quality',
                                     record['sequence']['qual'])
                    yield result
            else:
                for record in SeqIO.parse(handle, file_format):
                    yield record
Example #19
def genbank_filter_chain(filenames, organism=None, max_length=None):
    processed_read_count = 0
    for genbank_filename in filenames:
        logging.info('processing %s', genbank_filename)
        with open_compressed(genbank_filename, 'rt') as genbank_file:
            for record in SeqIO.parse(genbank_file, 'genbank'):
                if (organism is None) or (organism
                                          == record.annotations['organism']):
                    sequence_length = len(record.seq)
                    if (max_length is None) or (sequence_length <= max_length):
                        processed_read_count += 1
                        yield seq_record_from_genbank(record)
                    else:
                        logging.info(
                            'record %s (%s) is too big, %d > %s, ignoring',
                            record.id, record.description, sequence_length,
                            max_length)

    logging.info('processed %d records', processed_read_count)
Example #20
def main():
    parser = argparse.ArgumentParser(
        description='get the clone counts in the given Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('lineage_label',
                        metavar='label',
                        help='the clone label to use')
    parser.add_argument('filenames',
                        metavar='file',
                        nargs='+',
                        help='the Avro files to read')
    args = parser.parse_args()

    clones_counts = defaultdict(int)

    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)

            for record in reader:
                if args.lineage_label in record['lineages']:
                    subject = record['subject']
                    source = record['source']
                    type_ = record['sequence']['annotations']['target1']
                    lineage = record['lineages'][args.lineage_label]

                    clones_counts[(subject, source, type_, lineage)] += 1

    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=['subject', 'source', 'type', 'lineage', 'read_count'])
    writer.writeheader()

    for (subject, source, type_, lineage), read_count in clones_counts.items():
        row = {
            'subject': subject,
            'source': source,
            'type': type_,
            'lineage': lineage,
            'read_count': read_count
        }
        writer.writerow(row)
Example #21
def main():
    parser = argparse.ArgumentParser(description='extract immune receptor sequences from Genbank records',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('genbank_filename', metavar='genbank-file', help='the file with the Genbank records')
    parser.add_argument('igblast_output_filenames', metavar='parse.igblast', nargs='+', help='the output of IgBLAST to get the scores from')
    # options
    parser.add_argument('--min-v-score', metavar='S', type=float, default=70.0, help='the minimum score for the V-segment')
    parser.add_argument('--min-j-score', metavar='S', type=float, default=26.0, help='the minimum score for the J-segment')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    min_v_score = args.min_v_score
    min_j_score = args.min_j_score

    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        for genbank_record, igblast_record in zip(SeqIO.parse(genbank_handle, 'genbank'),
                                                  igblast_parse_chain(args.igblast_output_filenames)):
            assert genbank_record.id == igblast_record.query_name.split(' ')[0]

            if igblast_record:
                # get the best scores
                best_scores = defaultdict(float)
                for align_line in igblast_record.alignment_lines[1:]:
                    segment_type = align_line.segment_type
                    align_score = igblast_record.significant_alignments[align_line.name]
                    # save the best score for each segment type
                    best_scores[segment_type] = max(best_scores[segment_type], align_score.bit_score)

                if best_scores['V'] >= min_v_score and best_scores['J'] >= min_j_score:
                    SeqIO.write(genbank_record, sys.stdout, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s', time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(elapsed_time)))
Example #22
def main():
    parser = argparse.ArgumentParser(
        description=
        'convert a FASTQ file to FASTA, optionally trimming the read label',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input file
    parser.add_argument('fastq_filename',
                        metavar='file.fq',
                        nargs='?',
                        default='-',
                        help='FASTQ file to convert')
    # options
    parser.add_argument('-t',
                        '--trim-label',
                        action='store_true',
                        help='trim the read label at the first space')

    args = parser.parse_args()

    # read the FASTQ file
    with open_compressed(args.fastq_filename, 'rt') as input_handle:
        for read_id, read_seq, read_qual in FastqGeneralIterator(input_handle):
            if args.trim_label:
                read_id = read_id.split(' ')[0]
            print('>%s\n%s' % (read_id, read_seq))
Example #23
def main():
    parser = argparse.ArgumentParser(
        description='count the reads per target type in an Avro file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('filename',
                        metavar='file',
                        help='the Avro file to read')
    args = parser.parse_args()

    reader = fastavro.reader(open_compressed(args.filename, 'rb'))

    subject = None
    read_counts_type = defaultdict(int)

    for record in reader:
        if subject is None:
            subject = record['subject']
        else:
            assert subject == record['subject']
        type_ = record['sequence']['annotations']['target1']
        read_counts_type[type_] += 1

    for type_, count in read_counts_type.items():
        print(subject, type_, count, sep='\t')
Example #24
def main():
    parser = argparse.ArgumentParser(
        description=
        'batch paired-end sequences from an Illumina run of an amplicon library',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # directory to store the batches in
    parser.add_argument('batch_dirname',
                        metavar='dir',
                        help='name for the batch directory')
    # input files
    parser.add_argument('r1_filename',
                        metavar='r1_file',
                        help='the file with the read 1 sequences')
    parser.add_argument('r2_filename',
                        metavar='r2_file',
                        help='the file with the read 2 sequences')
    # parameters
    parser.add_argument('--batch-size',
                        '-b',
                        metavar='B',
                        type=int,
                        default=50000,
                        help='the number of read pairs to insert at a time')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    # check if the batch directory already exists
    if os.path.exists(args.batch_dirname):
        logging.error('batch directory %s already exists', args.batch_dirname)
        return 10
    else:
        os.mkdir(args.batch_dirname)

    read_count = 0
    batch_count = 0

    # read the FASTQ files
    with open_compressed(args.r1_filename, 'rt') as in_read1_handle, \
         open_compressed(args.r2_filename, 'rt') as in_read2_handle:

        # iterate over the read files
        for r1_batch, r2_batch in zip(
                batches(FastqGeneralIterator(in_read1_handle),
                        args.batch_size),
                batches(FastqGeneralIterator(in_read2_handle),
                        args.batch_size)):
            # filename prefix for the batches
            batch_prefix = os.path.join(args.batch_dirname,
                                        'batch%06d' % batch_count)

            logging.info('creating batch %06d', batch_count)

            # compressed batch output files
            with gzip.open(batch_prefix + '.fq1.gz', 'wt') as out_read1_handle, \
                 gzip.open(batch_prefix + '.fq2.gz', 'wt') as out_read2_handle:

                # for each read pair in the batch
                for r1_read, r2_read in zip(r1_batch, r2_batch):
                    r1_id, r1_seq, r1_qual = r1_read
                    r2_id, r2_seq, r2_qual = r2_read

                    # check that the read ids are the same
                    assert r1_id.split(' ')[0] == r2_id.split(
                        ' ')[0], 'read ids do not match %s != %s' % (r1_id,
                                                                     r2_id)

                    out_read1_handle.write('@%s\n%s\n+\n%s\n' %
                                           (r1_id, r1_seq, r1_qual))
                    out_read2_handle.write('@%s\n%s\n+\n%s\n' %
                                           (r2_id, r2_seq, r2_qual))

                    read_count += 1

            batch_count += 1

    logging.info('processed %d read pairs', read_count)
    logging.info('created %d batches', batch_count)
    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
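
The batches helper is not defined in this example. A minimal sketch, assuming it groups an iterator into lists of at most batch_size items:

import itertools


def batches(iterator, batch_size):
    # assumed helper: yield successive chunks until the underlying iterator
    # is exhausted
    while True:
        batch = list(itertools.islice(iterator, batch_size))
        if not batch:
            return
        yield batch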
Example #25
def avro_file_record_filter_iter(filenames, subject):
    for filename in filenames:
        logging.info('processing file %s', filename)
        with open_compressed(filename, 'rb') as record_handle:
            for record in fastavro.reader(record_handle):
                if record['subject'] == subject:
                    yield record
Example #26
def main():
    parser = argparse.ArgumentParser(
        description='generate some basic stats from a set of sequence records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filenames',
                        metavar='seq_record.avro',
                        nargs='+',
                        help='Avro files with the sequence records')
    #
    parser.add_argument('--parse-label',
                        '-p',
                        metavar='L',
                        help='collect stats on the given parse label')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    record_count = 0
    no_subject = 0
    no_subject_full_ident = 0
    no_subject_phix = 0
    no_subject_parsed = 0
    yes_subject = 0
    yes_subject_parsed = 0

    for filename in args.seq_record_filenames:
        logging.info('processing sequence record file %s', filename)
        with open_compressed(filename, 'rb') as input_handle:
            reader = fastavro.reader(input_handle)
            for record in reader:
                record_count += 1

                if record['subject'] is None:
                    no_subject += 1

                    if record['sequence']['annotations']['barcode1'] is not None and \
                       record['sequence']['annotations']['target1']  is not None and \
                       record['sequence']['annotations']['barcode2'] is not None and \
                       record['sequence']['annotations']['target2']  is not None:
                        no_subject_full_ident += 1

                    if record['sequence']['annotations']['phix1'] > 0 or \
                       record['sequence']['annotations']['phix2'] > 0:
                        no_subject_phix += 1

                    if args.parse_label:
                        if args.parse_label in record['parses'] and record[
                                'parses'][args.parse_label] is not None:
                            no_subject_parsed += 1
                else:
                    yes_subject += 1
                    if args.parse_label:
                        if args.parse_label in record['parses'] and record[
                                'parses'][args.parse_label] is not None:
                            yes_subject_parsed += 1

    print('processed %d records' % record_count)
    print('  %d (%0.2f%%) had subject' %
          (yes_subject, 100 * yes_subject / record_count))
    if args.parse_label:
        print('    %d (%0.2f%%) of those had parses (%s)' %
              (yes_subject_parsed, 100 * yes_subject_parsed / yes_subject,
               args.parse_label))
    print('  %d (%0.2f%%) had no subject' %
          (no_subject, 100 * no_subject / record_count))

    if no_subject > 0:
        if args.parse_label:
            print('    %d (%0.2f%%) of those had parses (%s)' %
                  (no_subject_parsed, 100 * no_subject_parsed / no_subject,
                   args.parse_label))
        print('    %d (%0.2f%%) of those were PhiX' %
              (no_subject_phix, 100 * no_subject_phix / no_subject))
        print(
            '    %d (%0.2f%%) of those had full idents' %
            (no_subject_full_ident, 100 * no_subject_full_ident / no_subject))

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #27
def igblast_parse_chain(filenames):
    for filename in filenames:
        with open_compressed(filename, 'rt') as igblast_handle:
            igblast_parse_reader = IgBLASTParser(igblast_handle)
            for parse in igblast_parse_reader:
                yield parse
Example #28
def main():
    parser = argparse.ArgumentParser(
        description='get per-read mutation levels from Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('parse_label',
                        metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('filenames',
                        metavar='file',
                        nargs='+',
                        help='the Avro file to read')
    parser.add_argument('--lineage',
                        '-l',
                        metavar='L',
                        help='the lineage label to use')
    parser.add_argument('--min-v-score',
                        '-v',
                        metavar='S',
                        type=float,
                        default=70,
                        help='minimum V-segment score')
    parser.add_argument('--min-j-score',
                        '-j',
                        metavar='S',
                        type=float,
                        default=26,
                        help='minimum J-segment score')
    args = parser.parse_args()

    writer = None

    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)

            for record in reader:
                parse = record['parses'][args.parse_label]
                best_v, v_score, _, _, _, j_score = best_vdj_score(parse)

                if v_score is not None and j_score is not None and \
                        v_score >= args.min_v_score and j_score >= args.min_j_score:

                    subject = record['subject']
                    type_ = record['sequence']['annotations']['target1']

                    v_j_in_frame = parse['v_j_in_frame']
                    has_stop_codon = parse['has_stop_codon']

                    best_q = get_parse_query(parse)
                    assert best_q['padding']['start'] == 0

                    q_align = best_q['alignment']
                    v_align = best_v['alignment']

                    mut_level = mutation_level(q_align, v_align)

                    if writer is None:
                        if args.lineage:
                            writer = csv.DictWriter(sys.stdout,
                                                    fieldnames=[
                                                        'subject', 'source',
                                                        'type', 'lineage',
                                                        'v_j_in_frame',
                                                        'has_stop_codon',
                                                        'mutation_level'
                                                    ])
                        else:
                            writer = csv.DictWriter(sys.stdout,
                                                    fieldnames=[
                                                        'subject', 'source',
                                                        'type', 'v_j_in_frame',
                                                        'has_stop_codon',
                                                        'mutation_level'
                                                    ])
                        writer.writeheader()

                    row = {
                        'subject': record['subject'],
                        'source': record['source'],
                        'type': type_,
                        'v_j_in_frame': v_j_in_frame,
                        'has_stop_codon': has_stop_codon,
                        'mutation_level': mut_level
                    }
                    if args.lineage:
                        if args.lineage in record['lineages']:
                            row['lineage'] = record['lineages'][args.lineage]

                    writer.writerow(row)
Example #29
def main():
    parser = argparse.ArgumentParser(
        description=
        'sort the sequence records in the given Avro file into a HIVE style directory structure',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument(
        'pathname_base',
        metavar='dir_path',
        help='the base pathname for the HIVE style directory structure')
    parser.add_argument('seq_record_filenames',
                        metavar='seq_rec.avro',
                        nargs='+',
                        help='the Avro file with the sequence records')
    # options
    arg_group = parser.add_mutually_exclusive_group(required=False)
    arg_group.add_argument('--no-none',
                           action='store_true',
                           help='do not process records without a subject')
    arg_group.add_argument('--only-none',
                           action='store_true',
                           help='only process records without a subject')

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    start_time = time.time()

    # make the base directory if it doesn't already exist
    base_path = args.pathname_base
    logging.info('making base directory %s', base_path)
    try:
        os.mkdir(base_path)
    except FileExistsError:
        logging.info('base directory already exists, adding to it')

    # load the records in and group them by subject and source into a list
    with tempfile.TemporaryDirectory() as temp_dir_name:
        logging.info('writing sequences to %s', temp_dir_name)
        temp_handles = {}
        temp_writers = {}
        for filename in args.seq_record_filenames:
            logging.info('loading sequence records from %s', filename)
            with open_compressed(filename, 'rb') as seq_record_handle:
                seq_record_reader = fastavro.reader(seq_record_handle)
                for record in seq_record_reader:
                    subject = record['subject']
                    source = record['source']

                    if args.no_none and subject is None:
                        continue
                    elif args.only_none and subject is not None:
                        continue

                    if subject not in temp_handles:
                        temp_handles[subject] = {}
                        temp_writers[subject] = {}
                    if source not in temp_handles[subject]:
                        temp_handles[subject][source] = open(
                            os.path.join(
                                temp_dir_name,
                                f'subject={subject},source={source}.avro'),
                            'wb')
                        temp_writers[subject][source] = open_avro(
                            temp_handles[subject][source],
                            seq_record_reader.writer_schema)
                    temp_writers[subject][source].write(record)

        # flush and close writers and handles
        for subject in temp_handles:
            for source in temp_handles[subject]:
                temp_writers[subject][source].flush()
                temp_handles[subject][source].close()
        del temp_writers

        logging.info('writing output')

        #
        for subject in temp_handles:
            subject_path = os.path.join(base_path, f'subject={subject}')
            # make sure the subject directory is created
            if os.path.isdir(subject_path):
                logging.info(
                    'using existing subject directory %s and adding to it',
                    subject_path)
            else:
                logging.info('making subject directory %s', subject_path)
                os.mkdir(subject_path)

            # for each (subject, source)
            for source in temp_handles[subject]:
                logging.info(
                    'loading and sorting records subject=%s/source=%s',
                    subject, source)
                with open(
                        os.path.join(
                            temp_dir_name,
                            f'subject={subject},source={source}.avro'),
                        'rb') as input_handle:
                    reader = fastavro.reader(input_handle)
                    records = list(reader)
                records.sort(key=itemgetter('name'))

                logging.info('writing records subject=%s/source=%s', subject,
                             source)
                output_filename = os.path.join(base_path, f'subject={subject}',
                                               f'source={source}.avro')
                with open(output_filename, 'wb') as output_handle:
                    fastavro.writer(output_handle,
                                    reader.writer_schema,
                                    records,
                                    codec='bzip2')
                del records

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
Example #30
def avro_1st_field_iterator(filename, fieldname):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)
        for record in reader:
            yield record[fieldname]
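
A hedged usage sketch; the filename is hypothetical and Counter is assumed to come from collections:

from collections import Counter

# tally the source field across one Avro file
source_counts = Counter(avro_1st_field_iterator('seq_records.avro', 'source'))
for source, count in source_counts.most_common():
    print(source, count, sep='\t')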