Example #1
    def GenericTest(self, end, orientation, aligned_fname):
        factory = rad_factory.ReadAlignmentDataFactory(self.TN_PARAMS, end,
                                                       orientation)
        rads_by_id = factory.DictFromFiles(self.FILTERED_READS, aligned_fname)
        n_reads = 0
        for read_id, rad in rads_by_id.iteritems():
            n_reads += 1
            self.assertEqual(end, rad.fixed_seq_end)
            self.assertEqual(orientation, rad.fixed_seq_orientation)
            self.assertEqual(read_id, rad.read_record.id)
            self.assertEqual(self.TN_PARAMS.ValidLinker(rad.linker_seq),
                             rad.valid_linker)

            # Check completeness of data.
            self.assertIsNotNone(rad.insertion_site)
            self.assertIsNotNone(rad.insertion_idx)
            self.assertIsNotNone(rad.linker_seq)
            self.assertIsNotNone(rad.expected_insert_end_seq)
            self.assertIsNotNone(rad.in_frame_insertion)
            self.assertIsNotNone(rad.forward_insertion)
            self.assertIsNotNone(rad.backbone_match_strand)
            self.assertEqual(
                rad.forward_insertion,
                rad.fixed_seq_orientation == rad.backbone_match_strand)

            self.assertIsNotNone(rad.insert_start_idx)
            self.assertIsNotNone(rad.insert_end_idx)
            self.assertIsNotNone(rad.backbone_start_idx)
            self.assertIsNotNone(rad.backbone_end_idx)
            self.assertIsNotNone(rad.linker_start_idx)
            self.assertIsNotNone(rad.linker_end_idx)
            if rad.insert_start_idx >= 0:
                # Found the insert
                self.assertIsNotNone(rad.insert_match_strand)
                self.assertIsNotNone(rad.linker_start_idx)
                self.assertIsNotNone(rad.linker_end_idx)

        self.assertGreater(n_reads, 100)  # Expect more than 100 reads.

        outf = StringIO()
        factory.WriteCSVFile(rads_by_id.itervalues(), outf)
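
A hypothetical pair of concrete test methods built on this helper (the '5p'/'3p' end labels and the 1/-1 orientation values mirror Example #3; the method names and SAM filenames are placeholders):

    def test5pForward(self):
        self.GenericTest('5p', 1, 'aligned_5p_forward.sam')

    def test3pReverse(self):
        self.GenericTest('3p', -1, 'aligned_3p_reverse.sam')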
Example #2
    def GetReadAlignementData(self, end, orientation, aligned_fname):
        factory = rad_factory.ReadAlignmentDataFactory(
            self.TN_PARAMS, end, orientation)
        rads_by_id = factory.DictFromFiles(self.FILTERED_READS,
                                           aligned_fname)
        return rads_by_id
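
A minimal usage sketch for this accessor (the end label, orientation, and filename follow Example #3's conventions and are placeholders here):

    rads_by_id = self.GetReadAlignementData('5p', 1, 'aligned_5p.sam')
    for read_id, rad in rads_by_id.iteritems():
        print read_id, rad.insertion_site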
Example #3
def Main():
    parser = argparse.ArgumentParser(description='Filter reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument(
        "-i",
        "--insert_db_filename",
        required=True,
        help=("Path to FASTA file containing insert ends to align reads to. "
              "Will only retain reads that align well to this DB."))
    parser.add_argument("-r",
                        "--read_filenames",
                        nargs='+',
                        required=True,
                        help="Path to FASTQ files containing reads.")
    parser.add_argument("-t",
                        "--tmp_dir",
                        default="_read_filter_data",
                        help="Path to use to store intermediate files.")
    parser.add_argument("-o",
                        "--output_fname",
                        required=True,
                        help="Where to write output data (CSV).")
    TranspositionParams.AddArgs(parser)
    parser.set_defaults(summary_output=True)
    args = parser.parse_args()

    start_ts = time.time()
    command_util.CheckAllInstalled(
        ['fastq-grep', 'bbduk.sh', 'bowtie2', 'samtools'])

    tn_params = TranspositionParams.FromArgs(args)
    print tn_params

    insert_db_fname = args.insert_db_filename
    bbone_db_fname = args.backbone_db_filename
    read_fnames = ForceExpand(args.read_filenames)

    assert len(read_fnames) > 0, 'There better be read files'

    print '##### Retaining reads that match insert #####'
    insert_filtered_fnames = [
        MakeFname(i, 'fq', dest_dir=args.tmp_dir, postfix='insert_filtered')
        for i in read_fnames
    ]
    BBDukRetainMulti(insert_db_fname,
                     read_fnames,
                     insert_filtered_fnames,
                     k=12)

    print '##### Retaining reads also matching backbone #####'
    insert_bbone_filtered_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='insert_bbone_filtered') for i in read_fnames
    ]
    BBDukRetainMulti(bbone_db_fname,
                     insert_filtered_fnames,
                     insert_bbone_filtered_fnames,
                     k=12)

    print '##### Masking insert in reads #####'
    filtered_masked_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_insert_masked') for i in read_fnames
    ]
    BBDukMaskMulti(insert_db_fname,
                   insert_bbone_filtered_fnames,
                   filtered_masked_fnames,
                   k=12,
                   kmask='Z')

    print '##### Trimming fixed sequence and insert from reads #####'
    fixed_5p = Seq(args.fixed_5p)
    fixed_3p = Seq(args.fixed_3p)
    pattern_5p = '%s[ATCG]{0,11}ZZZZZ' % args.fixed_5p
    pattern_3p = 'ZZZZZ[ATCG]{0,11}%s' % args.fixed_3p
    pattern_5p_rev = 'ZZZZZ[ATCG]{0,11}%s' % fixed_5p.reverse_complement()
    pattern_3p_rev = '%s[ATCG]{0,11}ZZZZZ' % fixed_3p.reverse_complement()

    trimmed_5p_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_5p') for i in read_fnames
    ]
    GrepTrimMulti(pattern_5p,
                  filtered_masked_fnames,
                  trimmed_5p_fnames,
                  trim_after=True,
                  trim_match=True)

    trimmed_3p_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_3p') for i in read_fnames
    ]
    GrepTrimMulti(pattern_3p,
                  filtered_masked_fnames,
                  trimmed_3p_fnames,
                  trim_before=True,
                  trim_match=True)

    trimmed_5p_rev_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_5p_rev') for i in read_fnames
    ]
    GrepTrimMulti(pattern_5p_rev,
                  filtered_masked_fnames,
                  trimmed_5p_rev_fnames,
                  trim_before=True,
                  trim_match=True)

    trimmed_3p_rev_fnames = [
        MakeFname(i,
                  'fq',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_3p_rev') for i in read_fnames
    ]
    GrepTrimMulti(pattern_3p_rev,
                  filtered_masked_fnames,
                  trimmed_3p_rev_fnames,
                  trim_after=True,
                  trim_match=True)

    print '##### Aligning to backbone #####'
    aligned_5p_fnames = [
        MakeFname(i,
                  'sam',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_5p_aligned') for i in read_fnames
    ]
    BBMapAlignMulti(bbone_db_fname, trimmed_5p_fnames, aligned_5p_fnames)

    aligned_3p_fnames = [
        MakeFname(i,
                  'sam',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_3p_aligned') for i in read_fnames
    ]
    BBMapAlignMulti(bbone_db_fname, trimmed_3p_fnames, aligned_3p_fnames)

    aligned_5p_rev_fnames = [
        MakeFname(i,
                  'sam',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_5p_rev_aligned')
        for i in read_fnames
    ]
    BBMapAlignMulti(bbone_db_fname, trimmed_5p_rev_fnames,
                    aligned_5p_rev_fnames)

    aligned_3p_rev_fnames = [
        MakeFname(i,
                  'sam',
                  dest_dir=args.tmp_dir,
                  postfix='filtered_trimmed_3p_rev_aligned')
        for i in read_fnames
    ]
    BBMapAlignMulti(bbone_db_fname, trimmed_3p_rev_fnames,
                    aligned_3p_rev_fnames)

    print '##### Indexing alignment output #####'
    aligned_5p_fnames_bam = SamtoolsIndexMutli(aligned_5p_fnames)
    aligned_3p_fnames_bam = SamtoolsIndexMutli(aligned_3p_fnames)
    aligned_5p_rev_fnames_bam = SamtoolsIndexMutli(aligned_5p_rev_fnames)
    aligned_3p_rev_fnames_bam = SamtoolsIndexMutli(aligned_3p_rev_fnames)

    print '##### Calculating insertions and writing output #####'

    # TODO: each of these factory calls re-reads the same masked reads.
    # If this proves slow, refactor into a container for the reads;
    # otherwise, ignore.
    print 'Writing output to', args.output_fname
    out_f = open(args.output_fname, 'w')
    dict_writer = rad_factory.ReadAlignmentDataFactory.MakeDictWriter(out_f)

    total_matched_reads = 0
    forward = 1
    reverse = -1
    factory = rad_factory.ReadAlignmentDataFactory(tn_params, '5p', forward)
    read_data_5p = factory.DictFromFileLists(insert_bbone_filtered_fnames,
                                             aligned_5p_fnames_bam)
    total_matched_reads += len(read_data_5p)
    factory.WriteToDictWriter(dict_writer, read_data_5p.itervalues())
    del read_data_5p  # hint to GC

    factory = rad_factory.ReadAlignmentDataFactory(tn_params, '3p', forward)
    read_data_3p = factory.DictFromFileLists(insert_bbone_filtered_fnames,
                                             aligned_3p_fnames_bam)
    total_matched_reads += len(read_data_3p)
    factory.WriteToDictWriter(dict_writer, read_data_3p.itervalues())
    del read_data_3p  # hint to GC

    factory = rad_factory.ReadAlignmentDataFactory(tn_params, '5p', reverse)
    read_data_5p_rev = factory.DictFromFileLists(insert_bbone_filtered_fnames,
                                                 aligned_5p_rev_fnames_bam)
    total_matched_reads += len(read_data_5p_rev)
    factory.WriteToDictWriter(dict_writer, read_data_5p_rev.itervalues())
    del read_data_5p_rev  # hint to GC

    factory = rad_factory.ReadAlignmentDataFactory(tn_params, '3p', reverse)
    read_data_3p_rev = factory.DictFromFileLists(insert_bbone_filtered_fnames,
                                                 aligned_3p_rev_fnames_bam)
    total_matched_reads += len(read_data_3p_rev)
    factory.WriteToDictWriter(dict_writer, read_data_3p_rev.itervalues())
    del read_data_3p_rev  # hint to GC
    out_f.close()

    print 'Saved', total_matched_reads, 'matching reads'

    duration = time.time() - start_ts
    print 'Running time: %.2f minutes' % (duration / 60.0)
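
The four factory/write blocks at the end of this Main differ only in the fixed-sequence end, the orientation, and the BAM file list; a minimal sketch of collapsing them into a loop, reusing the variable names defined above:

    runs = [('5p', forward, aligned_5p_fnames_bam),
            ('3p', forward, aligned_3p_fnames_bam),
            ('5p', reverse, aligned_5p_rev_fnames_bam),
            ('3p', reverse, aligned_3p_rev_fnames_bam)]
    for end, orientation, bam_fnames in runs:
        factory = rad_factory.ReadAlignmentDataFactory(tn_params, end,
                                                       orientation)
        read_data = factory.DictFromFileLists(insert_bbone_filtered_fnames,
                                              bam_fnames)
        total_matched_reads += len(read_data)
        factory.WriteToDictWriter(dict_writer, read_data.itervalues())
        del read_data  # hint to GC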
Example #4
def Main():
    parser = argparse.ArgumentParser(description='Filter reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument(
        "-i",
        "--insert_db_filename",
        required=True,
        help=("Path to FASTA file containing insert ends to align reads to. "
              "Will only retain reads that align well to this DB."))
    parser.add_argument(
        "-b",
        "--backbone_db_filename",
        required=True,
        help=("Path to FASTA file containing backbone sequence. "
              "Will bin reads by where they align to this sequence."))
    parser.add_argument("-r",
                        "--read_filenames",
                        nargs='+',
                        required=True,
                        help="Path to FASTQ files containing reads.")
    parser.add_argument(
        "--start_offset",
        required=True,
        type=int,
        help=("Offset of the start codon into the sequence used "
              "to match the backbone (nt units)"))
    parser.add_argument("-t",
                        "--tmp_dir",
                        default="_read_filter_data",
                        help="Path to use to store intermediate files.")
    parser.add_argument("--fastq_ignore_degenerate",
                        default=False,
                        action='store_true',
                        help="If set, ignore reads with degenerate bases (N).")
    parser.add_argument("--blat_tile_size",
                        type=int,
                        default=8,
                        help="Tile size to use for BLAT search.")
    parser.add_argument("--blat_step_size",
                        type=int,
                        default=2,
                        help="Step size to use for BLAT search.")
    parser.add_argument("--blat_min_score",
                        type=int,
                        default=15,
                        help="Minimum score to retain a BLAT match.")
    parser.add_argument(
        "--blat_min_match",
        type=int,
        default=2,
        help="Minimum number of BLAT tiles to trigger matching.")
    parser.add_argument("--blat_max_gap",
                        type=int,
                        default=0,
                        help="Blat maximum number of gaps between tiles.")
    parser.add_argument(
        "--blat_one_off",
        type=int,
        default=0,
        help="Allow one mismatch in BLAT tile to trigger matching.")
    parser.add_argument(
        "--blat_rep_match",
        type=int,
        default=1000000,
        help="Number of tile repetitions before marked overused.")
    parser.add_argument("--blat_output_type",
                        default="pslx",
                        help="Blat output format")
    parser.add_argument("-o",
                        "--summary_output_csv_filename",
                        help="Where to write CSV output to.")
    parser.add_argument("-n",
                        "--no_summary_output",
                        dest='summary_output',
                        action='store_false')
    parser.add_argument("-s",
                        "--summary_output",
                        dest='summary_output',
                        action='store_true')
    parser.set_defaults(summary_output=True)
    args = parser.parse_args()
    """
    NOTE: BLAT documents guarantee finding of exact matches of length
        2*step_size + tile_size - 1
    With the default parameters above, this will give us all exact matches
    of 11 nt or longer.
    
    See documentation here:
        http://genome.ucsc.edu/FAQ/FAQblat.html#blat8 
    """

    # Check that everything we need exists.
    command_util.CheckAllInstalled(['fastq_to_fasta', 'blat'])

    # Get the filenames we are supposed to process.
    # TODO(flamholz): refactor this code so it's calling out to well-named functions.
    print 'Input read filenames', args.read_filenames
    read_filenames = filename_util.ForceExpand(args.read_filenames)
    print 'Read filenames', read_filenames
    read_filenames = filter(
        lambda n: path.splitext(n)[1] in ['.fq', '.fastq', '.fa', '.fasta'],
        read_filenames)
    print 'Read filenames:', ','.join(read_filenames)
    assert len(read_filenames) > 0, 'Must provide reads!'
    if args.summary_output:
        assert args.summary_output_csv_filename, 'Must provide output filename to write output.'

    # Check that all the input files exist.
    filename_util.CheckAllExist(read_filenames)

    # Make the temporary directory if needed.
    if not path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Convert the FASTQ input files to FASTA, as BLAT doesn't seem to accept
    # FASTQ input. NOTE: this conversion seems to drop sequences containing
    # uncalled bases (reported as "N"). Revisit in the future.
    all_fa = np.all(map(lambda fname: fname.endswith('.fa'), read_filenames))
    start_ts = time.time()
    if not all_fa:
        print 'Converting FASTQ to FASTA'
        keep_degenerate = not args.fastq_ignore_degenerate
        fasta_fnames = ConvertFASTQToFASTA(read_filenames,
                                           args.tmp_dir,
                                           keep_degenerate=keep_degenerate)
        duration = time.time() - start_ts
        print 'Finished converting to FASTA, took %.3f seconds' % duration
    else:
        fasta_fnames = read_filenames

    # Align the reads in FASTA format to the insert.
    print 'Aligning reads to insert database at %s' % args.insert_db_filename
    start_align_ts = time.time()
    insert_psl_fnames = AlignReadsToDB(
        args.insert_db_filename,
        fasta_fnames,
        args.tmp_dir,
        output_filename_postfix='insert_aligned',
        blat_tile_size=args.blat_tile_size,
        blat_step_size=args.blat_step_size,
        blat_min_score=args.blat_min_score,
        blat_min_match=args.blat_min_match,
        blat_one_off=args.blat_one_off,
        blat_rep_match=args.blat_rep_match,
        blat_max_gap=args.blat_max_gap,
        output_type='pslx')
    align_duration = time.time() - start_align_ts
    print 'Finished BLAT alignment to database, took %.3f seconds' % align_duration

    # Align the reads to the backbone.
    print 'Aligning reads to backbone database at %s' % args.backbone_db_filename
    start_align_ts = time.time()
    backbone_psl_fnames = AlignReadsToDB(
        args.backbone_db_filename,
        fasta_fnames,
        args.tmp_dir,
        output_filename_postfix='backbone_aligned',
        blat_tile_size=args.blat_tile_size,
        blat_step_size=args.blat_step_size,
        blat_min_score=args.blat_min_score,
        blat_min_match=args.blat_min_match,
        blat_one_off=args.blat_one_off,
        blat_rep_match=args.blat_rep_match,
        blat_max_gap=args.blat_max_gap,
        output_type='pslx')
    align_duration = time.time() - start_align_ts
    print 'Finished BLAT alignment to backbone, took %.3f seconds' % align_duration

    total_duration = time.time() - start_ts
    print 'Done aligning, took %.2f minutes' % (total_duration / 60.0)

    if not args.summary_output:
        print 'Not asked for summary output, bailing'
        return

    insert_aligned_fnames = insert_psl_fnames
    backbone_aligned_fnames = backbone_psl_fnames

    # Make sure all the filenames are in the same order so we can zip them.
    insert_aligned_fnames = sorted(insert_aligned_fnames)
    backbone_aligned_fnames = sorted(backbone_aligned_fnames)
    fasta_fnames = sorted(fasta_fnames)
    assert insert_aligned_fnames
    assert backbone_aligned_fnames
    assert fasta_fnames
    assert len(insert_aligned_fnames) == len(backbone_aligned_fnames)
    assert len(fasta_fnames) == len(insert_aligned_fnames)
    print 'Insert aligned filenames'
    print insert_aligned_fnames
    print 'Backbone aligned filenames'
    print backbone_aligned_fnames
    print 'FASTA fnames'
    print fasta_fnames

    # Gather all the reads information by read ID.
    start_ts = time.time()
    rad_factory = factory.ReadAlignmentDataFactory(
        backbone_start_offset=args.start_offset,
        fixed_5p_seq=rad.DEFAULT_FIXED_5P_SEQ,
        fixed_3p_seq=rad.DEFAULT_FIXED_3P_SEQ)
    read_data_by_id = rad_factory.DictFromFileLists(insert_aligned_fnames,
                                                    backbone_aligned_fnames,
                                                    fasta_fnames)
    insertions = [r.has_insertion for r in read_data_by_id.itervalues()]
    fwd_insertions = [
        r.has_forward_insertion for r in read_data_by_id.itervalues()
    ]

    n_total_w_matches = len(read_data_by_id)
    n_insertions = np.sum(insertions)
    n_fwd_insertions = np.sum(fwd_insertions)

    print 'Total reads with any matches:', n_total_w_matches
    print 'Reads with insertions', n_insertions
    print 'Reads with forward insertions', n_fwd_insertions

    total_duration = time.time() - start_ts
    print 'Done collecting read statistics, took %.2f minutes' % (
        total_duration / 60.0)

    start_ts = time.time()
    out_fname = args.summary_output_csv_filename
    print 'Writing insertion matches to', out_fname
    rad_factory.WriteCSV(read_data_by_id.itervalues(), out_fname)
    total_duration = time.time() - start_ts
    print 'Done writing read statistics, took %.2f minutes' % (
        total_duration / 60.0)
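
Both AlignReadsToDB calls in this Main pass an identical set of BLAT tuning flags; a sketch of factoring them into a shared kwargs dict, using the same argument names as above:

    blat_kwargs = dict(blat_tile_size=args.blat_tile_size,
                       blat_step_size=args.blat_step_size,
                       blat_min_score=args.blat_min_score,
                       blat_min_match=args.blat_min_match,
                       blat_one_off=args.blat_one_off,
                       blat_rep_match=args.blat_rep_match,
                       blat_max_gap=args.blat_max_gap,
                       output_type='pslx')
    insert_psl_fnames = AlignReadsToDB(
        args.insert_db_filename, fasta_fnames, args.tmp_dir,
        output_filename_postfix='insert_aligned', **blat_kwargs)
    backbone_psl_fnames = AlignReadsToDB(
        args.backbone_db_filename, fasta_fnames, args.tmp_dir,
        output_filename_postfix='backbone_aligned', **blat_kwargs)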
Example #5
def Main():
    parser = argparse.ArgumentParser(description='Filter reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument(
        "-i",
        "--insert_alignment_fname",
        required=True,
        help=("Path to FASTA file containing insert ends to align reads to. "
              "Will only retain reads that align well to this DB."))
    parser.add_argument(
        "-b",
        "--backbone_alignment_fname",
        required=True,
        help=("Path to FASTA file containing backbone sequence. "
              "Will bin reads by where they align to this sequence."))
    parser.add_argument("-r",
                        "--reads_fname",
                        required=True,
                        help="Path to FASTQ files containing reads.")
    parser.add_argument(
        "--start_offset",
        default=23,
        type=int,
        help=("Offset of the start codon into the sequence used "
              "to match the backbone (nt units)"))
    args = parser.parse_args()

    ofh = open('false_negative_reads.fa', 'w')
    writer = FastaIO.FastaWriter(ofh)
    writer.write_header()

    d_by_construct = {}

    FACTORY = factory.ReadAlignmentDataFactory(
        backbone_start_offset=args.start_offset,
        fixed_5p_seq=rad.DEFAULT_FIXED_5P_SEQ,
        fixed_3p_seq=rad.DEFAULT_FIXED_3P_SEQ)
    print args.reads_fname
    READ_DATA = FACTORY.DictFromFiles(args.insert_alignment_fname,
                                      args.backbone_alignment_fname,
                                      args.reads_fname)
    with open(args.reads_fname) as f:
        reader = SeqIO.parse(f, 'fasta')
        for record in reader:
            read_info = _parseReadInfo(record)

            cnum = read_info["construct_num"]
            d = d_by_construct.setdefault(cnum, read_info)

            found_match = (record.id in READ_DATA)
            should_match = read_info['should_match']
            true_pos = found_match and should_match
            false_pos = found_match and not should_match
            true_neg = not found_match and not should_match
            false_neg = not found_match and should_match

            d['total_reads'] = d.get('total_reads', 0) + 1
            d['true_pos'] = d.get('true_pos', 0) + true_pos
            d['false_pos'] = d.get('false_pos', 0) + false_pos
            d['true_neg'] = d.get('true_neg', 0) + true_neg
            d['false_neg'] = d.get('false_neg', 0) + false_neg

            if false_neg:
                writer.write_record(record)
    writer.write_footer()
    ofh.close()

    df = pd.DataFrame.from_dict(d_by_construct, orient='index')
    df.to_csv('match_stats.csv')
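
The per-construct counters above form a confusion matrix; a short follow-up sketch (assuming only the column names this script writes to match_stats.csv) deriving per-construct precision and recall with pandas:

    df = pd.read_csv('match_stats.csv', index_col=0)
    df['precision'] = df['true_pos'] / (df['true_pos'] + df['false_pos'])
    df['recall'] = df['true_pos'] / (df['true_pos'] + df['false_neg'])
    print df[['precision', 'recall']].describe()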