Exemplo n.º 1
0
def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = tuple(fasta.keys())

    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
            outfile = open(filename, 'w')
        else:
            outfile = sys.stdout
        try:
            for line in fetch_sequence(args, fasta, name, start, end):
                outfile.write(line)
        except FetchError as e:
            raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()
Exemplo n.º 2
0
def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension

    filt_function = re.compile(args.regex).search

    if args.invert_match:
        filt_function = lambda x: not re.compile(args.regex).search(x)

    fasta = Fasta(args.fasta,
                  default_seq=args.default_seq,
                  key_function=eval(args.header_function),
                  strict_bounds=not args.lazy,
                  split_char=args.delimiter,
                  filt_function=filt_function,
                  read_long_names=args.long_names,
                  rebuild=not args.no_rebuild)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()

    header = False
    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.size_range:
            if start is not None and end is not None:
                sequence_len = end - start
            else:
                sequence_len = len(fasta[name])
            if args.size_range[0] > sequence_len or args.size_range[
                    1] < sequence_len:
                continue
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename
                               if c.isalnum() or c in keepcharacters)
            outfile = smart_open(filename, 'w', s3_upload=s3_config)
        elif args.out:
            outfile = args.out
        else:
            outfile = sys.stdout
        try:
            if args.transform:
                if not header and args.transform == 'nucleotide':
                    outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\tothers\n")
                    header = True
                outfile.write(transform_sequence(args, fasta, name, start,
                                                 end))
            else:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
        except FetchError as e:
            raise FetchError(str(e) + " Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()
Exemplo n.º 3
0
def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()
    if args.invert_match:
        sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
        regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
        split_function = ucsc_split

    header = False
    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.size_range:
            if start is not None and end is not None:
                sequence_len = end - start
            else:
                sequence_len = len(fasta[name])
            if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
                continue
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
            outfile = open(filename, 'w')
        elif args.out:
            outfile = args.out
        else:
            outfile = sys.stdout
        try:
            if args.transform:
                if not header and args.transform == 'nucleotide':
                    outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                    header = True
                outfile.write(transform_sequence(args, fasta, name, start, end))
            else:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
        except FetchError as e:
            raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()
Exemplo n.º 4
0
def write_sequence(args):
    _, ext = os.path.splitext(args.fasta)
    if ext:
        ext = ext[1:]  # remove the dot from extension
    filt_function = re.compile(args.regex).search
    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function)

    regions_to_fetch, split_function = split_regions(args)
    if not regions_to_fetch:
        regions_to_fetch = fasta.keys()
    if args.invert_match:
        sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)
        regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
        split_function = ucsc_split

    header = False
    for region in regions_to_fetch:
        name, start, end = split_function(region)
        if args.size_range:
            if start is not None and end is not None:
                sequence_len = end - start
            else:
                sequence_len = len(fasta[name])
            if args.size_range[0] > sequence_len or args.size_range[1] < sequence_len:
                continue
        if args.split_files:  # open output file based on sequence name
            filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
            outfile = open(filename, 'w')
        elif args.out:
            outfile = args.out
        else:
            outfile = sys.stdout
        try:
            if args.transform:
                if not header and args.transform == 'nucleotide':
                    outfile.write("name\tstart\tend\tA\tT\tC\tG\tN\n")
                    header = True
                outfile.write(transform_sequence(args, fasta, name, start, end))
            else:
                for line in fetch_sequence(args, fasta, name, start, end):
                    outfile.write(line)
        except FetchError as e:
            raise FetchError(e.msg.rstrip() + "Try setting --lazy.\n")
        if args.split_files:
            outfile.close()
    fasta.__exit__()