def get_dupes(lastz_file, regex, format):
    """Given a lastz_file of probes aligned to themselves, get duplicates"""
    matches = defaultdict(list)
    dupes = set()
    # get names and strip probe designation since loci are the same
    print "Parsing lastz file..."
    for lz in lastz.Reader(lastz_file, long_format=format):
        target_name = new_get_probe_name(lz.name1, regex)
        query_name = new_get_probe_name(lz.name2, regex)
        matches[target_name].append(query_name)
    # see if one probe matches any other probes
    # other than the children of the locus
    print "Screening results..."
    for k, v in matches.iteritems():
        # if the probe doesn't match itself, we have
        # problems
        if len(v) > 1:
            for i in v:
                if i != k:
                    dupes.add(k)
                    dupes.add(i)
        elif k != v[0]:
            dupes.add(k)
    # make sure all names are lowercase
    return set([d.lower() for d in dupes])
def main():
    """Write one BED track of probe hits for every lastz file in ``args.input``.

    Each file matching ``*lastz*`` is parsed in long format.  The name after
    ``_v_`` in the file name becomes the track/output name, and every match is
    written to ``<args.output>/<name>.probe.bed`` through ``write_bed_file``.
    A warning is logged when a probe is written more than once.
    """
    args = get_args()
    log = setup_logger()
    for lastz_path in glob.glob(os.path.join(args.input, "*lastz*")):
        lz = lastz.Reader(lastz_path, long_format=True)
        probes = defaultdict(list)
        # get output file name
        search_result = re.search('_v_([A-Za-z0-9]+).lastz', os.path.basename(lastz_path))
        outname = search_result.groups()[0]
        log.info("Working on {}".format(outname))
        bed_path = os.path.join(args.output, "{}.probe.bed".format(outname))
        # the context manager closes the BED file even when parsing or
        # writing raises part-way through
        with open(bed_path, 'w') as outf:
            outf.write('''track name="uce-v-{0}" description="UCE probe matches to {0}" visibility=2 itemRgb="On"\n'''.format(outname))
            written = set()
            # the probe name is the text before the first '|' of the query name
            for match in lz:
                probe = match.name2.split('|')[0].strip()
                probes[probe].append([match.name1, match.zstart1, match.end1])
            for probe in sorted(probes.keys()):
                for match in probes[probe]:
                    chromo, start, end = match
                    if probe in written:
                        log.warn("{0} may have >1 hit".format(probe))
                    else:
                        written.add(probe)
                    write_bed_file(outf, chromo, start, end, probe)
示例#3
0
def get_dupe_matches(lastz_file, splitchar = "|", pos = 1, longfile = False):
    """Map every target name in ``lastz_file`` to the list of query names hitting it.

    Both names are reduced with ``get_name(name, splitchar, pos)``;
    ``longfile`` is passed to ``lastz.Reader`` as its second positional
    argument.
    """
    hits = defaultdict(list)
    for record in lastz.Reader(lastz_file, longfile):
        target, query = (
            get_name(record.name1, splitchar, pos),
            get_name(record.name2, splitchar, pos),
        )
        hits[target].append(query)
    return hits
def get_dupe_matches(lastz_file, longfile=False):
    """Map every target UCE name in ``lastz_file`` to the query UCE names hitting it.

    Names are reduced with ``get_uce_name``; ``longfile`` is passed to
    ``lastz.Reader`` as its second positional argument.
    """
    hits = defaultdict(list)
    for record in lastz.Reader(lastz_file, longfile):
        target, query = get_uce_name(record.name1), get_uce_name(record.name2)
        hits[target].append(query)
    return hits
def main():
    args = get_args()
    uce_loci = []
    # get lengths of loci
    seq_lengths = {}
    for seq in fasta.FastaReader(args.fasta):
        name = seq.identifier.split('|')[1]
        uce_loci.append(name)
        seq_lengths[name] = len(seq.sequence)
    overlappers = defaultdict(dict)
    names = defaultdict(list)
    coords = {}
    for match in lastz.Reader(args.lastz, long_format=True):
        locus = match.name2.split('|')[1]
        chromo = match.name1
        coords[locus] = (match.zstart1, match.end1)
        for pmatch, span in overlappers[chromo].iteritems():
            if locus == 'chr5_10696_s' and pmatch == 'chr13_710_s':
                pdb.set_trace()
            overlap = span.find(match.zstart1, match.end1)
            if overlap:
                overlappers[chromo][pmatch].insert(match.zstart1, match.end1,
                                                   locus)
                names[pmatch].append(locus)
                break
        else:
            overlappers = add_new_locus(match, overlappers, chromo)
    overlapping_loci = []
    all_groups = []
    for k, v in names.iteritems():
        # group loci into overlapping clusters
        base = [k]
        base.extend(v)
        all_groups.append(base)
        # get list of "bad loci" so we can determine non-overlappers
        overlapping_loci.append(k)
        overlapping_loci.extend(v)
    pdb.set_trace()
    non_overlapping_loci = set(uce_loci).difference(set(overlapping_loci))
    # generate output in config-file format:
    config = ConfigParser.RawConfigParser()
    config.add_section('Non-overlapping loci')
    for locus in list(non_overlapping_loci):
        config.set('Non-overlapping loci', locus, seq_lengths[locus])
    longest_of_overlapping = get_longest_of_overlapping_loci(
        all_groups, seq_lengths)
    config.add_section('Longest loci of group')
    for locus in longest_of_overlapping:
        config.set('Longest loci of group', locus, seq_lengths[locus])
    config.add_section('Superlocus groups')
    for c, group in enumerate(all_groups):
        # order loci by start position
        starts = [(name, coords[name][0], coords[name][1]) for name in group]
        starts = sorted(starts, key=itemgetter(1))
        sorted_names = [n[0] for n in starts]
        print starts
        #pdb.set_trace()
        config.set('Superlocus groups', "Group{0}".format(c),
                   ','.join(sorted_names))
    config.write(args.output)
示例#6
0
def get_dupes(lastz_file, format):
    """Given a lastz_file of probes aligned to themselves, get duplicates"""
    matches = defaultdict(list)
    dupes = set()
    # get names and strip probe designation since loci are the same
    print "Parsing lastz file..."
    for lz in lastz.Reader(lastz_file, long_format=format):
        target_name = "{}:{}-{}".format(lz.name1, lz.zstart1, lz.end1)
        query_name = lz.name2
        matches[query_name].append(target_name)
    # see if one probe matches any other probes
    # other than the children of the locus
    print "Screening results..."
    for k, v in matches.iteritems():
        name, pos = k.split('|')
        name = name.strip()
        # if the probe doesn't match itself, we have
        # problems
        if len(v) == 1:
            if not pos == v[0]:
                dupes.add(name)
        elif v > 1:
            dupes.add(name)
    # make sure all names are lowercase
    return set([d.lower() for d in dupes])
示例#7
0
def main(lformat=True):
    args = get_args()
    if args.conf and args.sections:
        conf = ConfigParser.ConfigParser()
        conf.read(args.conf)
        if not args.sections:
            args.sections = conf.sections()
        items = []
        for section in args.sections:
            items.extend([i[0] for i in conf.items(section)])
        items = set(items)
    else:
        items = None
    for match in lastz.Reader(args.lastz, long_format=args.long_format):
        try:
            name = match.name2.split('|')[1]
        except:
            name = match.name2.split(' ')[0]
        if match.percent_identity >= args.identity and match.percent_continuity >= args.continuity:
            if args.conf and items and (name in items):
                write_to_outfile(args, match, name)
            elif args.conf is None:
                write_to_outfile(args, match, name)
        else:
            print name
    args.output.close()
def get_bgi_matches(lastz_file, stripnum):
    """Collect lastz hits per locus and the highest probe number seen per locus.

    The locus name is the lowercased query name with ``stripnum`` replaced by
    ``'s'``; the probe number is the integer after the last ``'_'`` of the
    query name.  Returns ``(matches, probes)``: ``matches`` maps a locus to a
    list of ``[target, strand2, zstart1, end1]`` and ``probes`` maps a locus
    to its largest probe number.
    """
    hits = defaultdict(list)
    highest = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        locus = re.sub(stripnum, 's', record.name2).lower()
        number = int(record.name2.split('_')[-1])
        highest[locus] = max(highest[locus], number)
        target = get_name(record.name1).lower()
        hits[locus].append(
            [target, record.strand2, record.zstart1, record.end1])
    return hits, highest
def get_matches(lastz_file):
    """Collect lastz hits per UCE and the highest probe number seen per UCE.

    ``get_uce_name`` and ``get_uce_num`` derive the locus name and probe
    number from the query name.  Returns ``(matches, probes)``: ``matches``
    maps a locus to a list of ``[target, strand2, zstart1, end1]`` and
    ``probes`` maps a locus to its largest probe number.
    """
    hits = defaultdict(list)
    highest = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        locus = get_uce_name(record.name2)
        number = get_uce_num(record.name2)
        if highest[locus] < number:
            highest[locus] = number
        hit = [
            get_name(record.name1).lower(),
            record.strand2,
            record.zstart1,
            record.end1,
        ]
        hits[locus].append(hit)
    return hits, highest
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf)
    for genome in all_files:
        name, twobit_name = genome
        out_file = os.path.join(args.output, name) + ".fasta"
        out = fasta.FastaWriter(out_file)
        tb = twobit.TwoBitFile(file(twobit_name))
        lz = os.path.join(args.lastz, name) + ".lastz"
        count = 0
        for row in lastz.Reader(lz, long_format=True):
            sequence = slice_and_return_fasta(tb, row, args.flank)
            out.write(sequence)
            count += 1
        print "\t{} sequences written to {}".format(count, out_file)
        out.close()
示例#11
0
def get_matches(lastz_file, splitchar, components, fish = False):
    matches = defaultdict(list)
    probes = defaultdict(int)
    for lz in lastz.Reader(lastz_file, long_format = True):
        # skip silly hg19 mhc haplotypes
        if "hap" in lz.name1:
            print "Skipping: ", lz.name1
        else:
            if fish:
                uce_name = get_name(lz.name2, "_", 1)
                # add 1 because fish probe indexing starts @ 0
                probe_number = int(lz.name2.split('|')[1].split('_')[1]) + 1
            else:
                uce_name = get_name(lz.name2, "|", 1)
                probe_number = int(lz.name2.split(':')[-1])

            #pdb.set_trace()
            if probe_number > probes[uce_name]:
                probes[uce_name] = probe_number
            matches[uce_name].append([get_name(lz.name1, splitchar = splitchar, items = components), lz.strand2, lz.zstart1, lz.end1])
    return matches, probes
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.optionxform = str
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf, args.pattern)
    #pdb.set_trace()
    for genome in all_files:
        short_name, long_name, twobit_name = genome
        if not args.exclude or (short_name not in args.exclude):
            out_file = os.path.join(args.output, short_name) + ".fasta"
            out = fasta.FastaWriter(out_file)
            tb = twobit.TwoBitFile(file(twobit_name))
            lz = os.path.join(args.lastz, long_name)
            count = 0
            for row in lastz.Reader(lz, long_format=True):
                sequence = slice_and_return_fasta(tb, row, args.flank)
                out.write(sequence)
                count += 1
            print "\t{} sequences written to {}".format(count, out_file)
            out.close()
示例#13
0
def get_dupes(lastz_file, regex=None, repl=None):
    """Return the names of probes that align to something other than themselves.

    ``lastz_file`` holds probes aligned against themselves; names are reduced
    with ``get_name(name, "|", 1)``.  When ``regex`` is given, every reported
    name is rewritten with ``re.sub(regex, repl, name)`` and lowercased;
    otherwise the names are returned unchanged.
    """
    hits_by_target = defaultdict(list)
    for record in lastz.Reader(lastz_file):
        target = get_name(record.name1, "|", 1)
        query = get_name(record.name2, "|", 1)
        hits_by_target[target].append(query)
    # a probe is suspicious as soon as it hits any name but its own
    flagged = set()
    for target, queries in hits_by_target.iteritems():
        if len(queries) > 1:
            foreign = [q for q in queries if q != target]
            if foreign:
                flagged.add(target)
                flagged.update(foreign)
        elif target != queries[0]:
            # the single hit is not the self-match
            flagged.add(target)
    if regex:
        return set(re.sub(regex, repl, name).lower() for name in flagged)
    return flagged
示例#14
0
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)])
    else:
        uces = set([get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    contig = args.contigs#glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = ["contigs"]#get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database( uces )
    print "Processing:"
    #for contig in fasta_files:
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    #output = args.align 
    # os.path.join(
    #         args.align, \
    #         os.path.splitext(os.path.basename(contig))[0] + '.lastz'
    #      )
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align(
              contig,
              args.query,
              args.coverage,
              args.identity,
              args.align 
            )
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = \
                defaultdict(set), defaultdict(set), defaultdict(set)
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align ):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        print "Error in lastz:"
        print "STDerr:"
        print lztstderr
        print "STDout:"
        print lzstdout

    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
        	if matches[k] in already_observed:
        		del matches[k]
        	else:
        		already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output(
                critter,
                matches,
                contigs,
                probe_dupes,
                contigs_matching_mult_uces,
                uces_matching_mult_contigs
            )
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    data = {row[1].split("(")[0]:row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    assert len(data) == len(nodenames), "There were duplicate contigs."
    outp = open(args.output, 'w')
    print "Building UCE fasta:"
    #for contig in fasta_files:
    for record in SeqIO.parse(open(contig), 'fasta'):
        name = '_'.join(record.id.split('_')[:2])
        if name.lower() in nodenames:
            record.id = "{0}|{1}".format(data[name.lower()], record.id)
            outp.write(record.format('fasta'))
    outp.close()
示例#15
0
def main(args):
    """Align reference probes to every contig file and record the matches.

    Steps, in order:

    1. Create ``args.output`` (raising ``IOError`` if it already exists).
    2. Collect probe names from ``args.reference`` and, when given, duplicate
       probes from ``args.dupefile``.
    3. Run ``sed``/``mv``/``rm`` over the files in ``args.contigs`` to replace
       ambiguity codes with ``N`` and prefix file names with ``sample_``.
    4. For every contig file, run lastz, drop contigs hitting several probes
       and probes hitting several contigs, optionally write them to
       ``args.keep_duplicates``, and store the rest in the sqlite database.
    5. Write ``kmer_count.txt``, export the ``matches`` table to
       ``match_table.txt`` with the ``args.sqlite3`` binary, and write a
       ``config`` file listing organisms and loci.

    Raises ``EnvironmentError`` when lastz writes anything to stderr.
    """
    pre_regex = args.regex
    regex = re.compile("^(%s)(?:.*)" % pre_regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError(
            "The directory {} already exists.  Please check and remove by hand."
            .format(args.output))
    # one definition of the database path, used for creation, the final log
    # message and the sqlite export, so the three cannot drift apart
    db_path = os.path.join(args.output, 'probe.matches.sqlite')
    exons = set(
        new_get_probe_name(seq.id, regex)
        for seq in SeqIO.parse(open(args.reference, 'rU'), 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    for f in fasta_files:
        # NOTE(review): these commands are built by string interpolation and
        # run through the shell; paths with spaces or shell metacharacters
        # will break them
        replace_bad_fasta_chars = "sed -i -e '/>/! s=[K,Y,R,S,M,W,B,D,H,V,k,y,r,s,m,w,b,d,h,v]=N=g' %s" % f
        remove_os_sed_copies = "rm %s/*-e " % args.contigs
        fasta_name = f.split('/')[-1]
        if not fasta_name.startswith('sample'):
            rename_samples = "mv %s %s/sample_%s" % (f, args.contigs,
                                                     fasta_name)
            os.system(rename_samples)
        os.system(replace_bad_fasta_chars)
        os.system(remove_os_sed_copies)
    # list the files again because some may have been renamed above
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(log, db_path, organisms, exons)
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    kmers = {}
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.reference, args.min_coverage,
                                args.min_identity, output)
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment; stderr is known to be
        # empty here because of the raise above
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        for lz in lastz.Reader(output):
            contig_name = get_contig_name(lz.name1, args)
            exon_name = new_get_probe_name(lz.name2, regex)
            if args.dupefile and exon_name in dupes:
                probe_dupes.add(exon_name)
            else:
                matches[contig_name].add(exon_name)
                orientation[exon_name].add(lz.strand2)
                revmatches[exon_name].add(contig_name)

        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_exons = check_contigs_for_dupes(matches)
        exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_exons.union(exon_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(exon_dupe_exons) != 0:
                dupefile.write(
                    "[{} - probes hitting multiple contigs]\n".format(critter))
                for exon in exon_dupe_exons:
                    dupefile.write("{}:{}\n".format(
                        exon, ', '.join(revmatches[exon])))
                dupefile.write("\n")
            if len(contigs_matching_mult_exons) != 0:
                dupefile.write(
                    "[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    dupefile.write("{}:{}\n".format(dupe,
                                                    ', '.join(matches[dupe])))
                dupefile.write("\n")
                dupefile.write("[{} - contig orientation]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    matches_list = list(matches[dupe])
                    for exon in matches_list:
                        dupefile.write("{}:{}\n".format(
                            exon,
                            list(orientation[exon])[0]))
                dupefile.write("\n")

        # remove dupe and/or dubious nodes/contigs; iterate over a copy
        # because entries are deleted from ``matches`` inside the loop
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]

        # collect the kmer value of every retained target contig
        for lz in lastz.Reader(output):
            for element in matches:
                if re.search("^(\d*)\s\d*\s\d*.*",
                             lz[1]).groups()[0] == element:
                    kmer_value = get_kmer_value(lz.name1)
                    kmers.setdefault(contig, [])
                    kmers[contig].append(kmer_value)
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(log, critter, matches, contigs, probe_dupes,
                          contigs_matching_mult_exons, exon_dupe_exons)

    # the context manager flushes and closes the kmer summary; the handle
    # was previously never closed
    with open(os.path.join(args.output, 'kmer_count.txt'), 'w') as kmerfile:
        for key in kmers:
            count = sum(int(element) for element in kmers[key])
            kmerfile.write("%s : %d\n" %
                           (os.path.basename(key).split('.')[0], count))

    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    # report the path actually used; the message used to say
    # "probes.matches.sqlite", which is not the file that gets created
    log.info("The exon match database is in {}".format(db_path))
    text = "Completed"

    log.info(text.center(65, "="))

    # Access the SQL file and export tab-separated text-file
    sql_file = db_path
    tsf_out = os.path.join(args.output, 'match_table.txt')
    sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" % (
        args.sqlite3, sql_file, tsf_out)
    os.system(sql_cmd)

    # Create the config file for the extraction of the desired loci
    output_folder = args.output

    with open(os.path.join(output_folder, 'config'), 'w') as f:
        print('[Organisms]', file=f)
        for aln in glob.glob(os.path.join(output_folder, '*.lastz')):
            aln = os.path.basename(aln)
            aln = aln.replace('.lastz', '')
            print(aln, file=f)

        print('\n[Loci]', file=f)
        with open(os.path.join(output_folder,
                               'match_table.txt')) as match_table:
            lines = match_table.readlines()
        # skip the header row of the exported table
        for line in lines[1:]:
            print(line.split('\t')[0], file=f)
示例#16
0
def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        ])
    else:
        uces = set([
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        ])
        regex = None
    if args.dupefile:
        print "\t Getting dupes"
        dupes = get_dupes(args.dupefile, regex, args.repl)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        os.path.join(args.output, 'probe.matches.sqlite'), organisms, uces)
    print "Processing:"
    for contig in fasta_files:
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
                    args.output, \
                    os.path.splitext(os.path.basename(contig))[0] + '.lastz'
                )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = \
                defaultdict(set), defaultdict(set), defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2,
                                    "|",
                                    1,
                                    regex=regex,
                                    repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(
            uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                            contigs_matching_mult_uces,
                            uces_matching_mult_contigs)