Exemplo n.º 1
0
def dinuclShuffle(s, alph=alphabet.dna()):
    """Return a shuffled version of ``s`` rebuilt from its own transitions.

    The sequence is converted into per-symbol lists of following symbols
    (``computeList``); those lists are shuffled and then walked from the
    first symbol to produce the new sequence.  The edges returned by
    ``eulerian`` are forced to the end of their vertex lists so the walk
    can finish.  The result always starts with ``s[0]`` and ends with
    ``s[-1]``.  The walk is intended to keep the dinucleotide content of
    ``s`` -- NOTE(review): that depends on ``eulerian``/``computeList``,
    which are defined elsewhere.

    Args:
        s: the sequence string to shuffle.
        alph: alphabet object providing ``getIndex`` and ``getSymbol``.

    Returns:
        ``s`` itself when it has two or fewer symbols, otherwise a new
        string of the same length.
    """
    # check we can actually shuffle it
    if len(s) <= 2:
        return s
    # determine how to end the sequence
    edgeList = eulerian(s, alph)
    # turn the sequence into lists of following symbols
    symIList = computeList(s, alph)
    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for x, y in edgeList:
        symIList[x].remove(y)
    for x in range(len(symIList)):
        random.shuffle(symIList[x])
    for x, y in edgeList:
        symIList[x].append(y)
    # Reverse every list once so the next successor can be taken with an
    # O(1) pop() from the end instead of an O(n) pop(0) from the front.
    # Successors are still consumed in exactly the same order.
    for x in range(len(symIList)):
        symIList[x].reverse()
    # construct the eulerian path
    prevSymI = alph.getIndex(s[0])
    L = [alph.getSymbol(prevSymI)]
    for _ in range(len(s) - 2):
        symI = symIList[prevSymI].pop()
        L.append(alph.getSymbol(symI))
        prevSymI = symI
    # the final symbol is always the original final symbol
    symI = alph.getIndex(s[-1])
    L.append(alph.getSymbol(symI))
    return "".join(L)
def dinuclShuffle(s, alph=alphabet.dna()):
    """Return a shuffled version of ``s`` rebuilt from its own transitions.

    The sequence is converted into per-symbol lists of following symbols
    (``computeList``); those lists are shuffled and then walked from the
    first symbol to produce the new sequence.  The edges returned by
    ``eulerian`` are forced to the end of their vertex lists so the walk
    can finish.  The result always starts with ``s[0]`` and ends with
    ``s[-1]``.  The walk is intended to keep the dinucleotide content of
    ``s`` -- NOTE(review): that depends on ``eulerian``/``computeList``,
    which are defined elsewhere.

    Args:
        s: the sequence string to shuffle.
        alph: alphabet object providing ``getIndex`` and ``getSymbol``.

    Returns:
        ``s`` itself when it has two or fewer symbols, otherwise a new
        string of the same length.
    """
    # check we can actually shuffle it
    if len(s) <= 2:
        return s
    # determine how to end the sequence
    edgeList = eulerian(s, alph)
    # turn the sequence into lists of following symbols
    symIList = computeList(s, alph)
    # remove last edges from each vertex list, shuffle, then add back
    # the removed edges at end of vertex lists.
    for x, y in edgeList:
        symIList[x].remove(y)
    for x in range(len(symIList)):
        random.shuffle(symIList[x])
    for x, y in edgeList:
        symIList[x].append(y)
    # Reverse every list once so the next successor can be taken with an
    # O(1) pop() from the end instead of an O(n) pop(0) from the front.
    # Successors are still consumed in exactly the same order.
    for x in range(len(symIList)):
        symIList[x].reverse()
    # construct the eulerian path
    prevSymI = alph.getIndex(s[0])
    L = [alph.getSymbol(prevSymI)]
    for _ in range(len(s) - 2):
        symI = symIList[prevSymI].pop()
        L.append(alph.getSymbol(symI))
        prevSymI = symI
    # the final symbol is always the original final symbol
    symI = alph.getIndex(s[-1])
    L.append(alph.getSymbol(symI))
    return "".join(L)
Exemplo n.º 3
0
def main():
    """Run the Hamming-enrichment command-line tool.

    Parses ``sys.argv`` for the word (``-w``), the positive and negative
    FASTA file names (``-p`` / ``-n``), an optional alphabet file (``-a``)
    and the refine switch (``-r``).  It then reads both sequence sets,
    optionally refines the word with ``refine_consensus``, computes the
    best Hamming alignment with ``get_best_hamming_alignment`` and prints
    the result as a MEME header and motif.  Progress messages and the
    elapsed time go to stderr; the elapsed time is also written to stdout
    as a ``#`` comment line.

    The usage message is printed to stderr and the process exits with
    status 1 when no arguments are given, when ``-h`` or an unknown option
    is seen, when an option is missing its value, or when any of the
    required ``-w`` / ``-p`` / ``-n`` options is absent.
    """
    pos_seq_file_name = None  # no positive sequence file specified
    neg_seq_file_name = None  # no negative sequence file specified
    alphabet_file_name = None
    # Initialised so that omitting -w is reported as a usage error below
    # rather than failing later with an unbound variable.
    word = None
    refine = False
    given_only = False  # no command-line option changes this; passed through

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.
    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    def usage_exit():
        # Every command-line problem (and -h) prints the usage and exits 1.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-w", "-p", "-n", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-w":
                word = value
            elif arg == "-p":
                pos_seq_file_name = value
            elif arg == "-n":
                neg_seq_file_name = value
            else:
                alphabet_file_name = value
        elif arg == "-r":
            refine = True
        else:
            # -h and unrecognised options are handled identically
            usage_exit()
        i += 1

    # check that required arguments given
    if (word is None or pos_seq_file_name is None
            or neg_seq_file_name is None):
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet, defaulting to DNA when no alphabet file is given
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    # read sequences
    print("Reading sequences...", file=sys.stderr)
    pos_seqs = get_strings_from_seqs(
        sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(
        sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # only the refined word is needed; its log p-value is recomputed below
        best_word, _ = refine_consensus(word, pos_seqs, neg_seqs, alph,
                                        given_only)
    else:
        best_word = word

    print("Computing Hamming alignment...", file=sys.stderr)
    (dist, log_pvalue, p, P, n, N,
     aln) = get_best_hamming_alignment(best_word, pos_seqs, neg_seqs, alph,
                                       given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print("[", p, P, n, N, dist, "]", file=sys.stderr)
    print(
        "Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s"
        % (nsites, P, dist, pv_string),
        file=sys.stderr)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print("elapsed time: %.2f seconds" % elapsed, file=sys.stderr)
    print("#elapsed time: %.2f seconds" % elapsed, file=sys.stdout)
Exemplo n.º 4
0
def main():
    """Run the dinucleotide-shuffle command-line tool.

    Parses ``sys.argv`` for the FASTA file name (``-f``, required), a name
    tag (``-t``), the random seed (``-s``), the number of copies (``-c``)
    and an optional alphabet file (``-a``).  The random generator is
    seeded, every sequence is read with ``sequence.readFASTA`` and, for
    each one, ``copies`` shuffled versions produced by ``dinuclShuffle``
    are written to stdout in FASTA format.  When more than one copy is
    requested the copy index is appended to the sequence name.

    The usage message is printed to stderr and the process exits with
    status 1 when no arguments are given, when ``-h`` is seen, when an
    option is missing its value, when ``-s`` / ``-c`` is not an integer,
    or when ``-f`` is absent.  An unknown option prints an error naming it
    and also exits with status 1.
    """
    #
    # defaults
    #
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Print the usage text to stderr and stop with a failure status.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-f":
                file_name = value
            elif arg == "-t":
                tag = value
            elif arg == "-a":
                alphabet_file_name = value
            else:
                # -s and -c take integers; anything else is a usage error
                try:
                    number = int(value)
                except ValueError:
                    usage_exit()
                if arg == "-s":
                    seed = number
                else:
                    copies = number
        elif arg == "-h":
            usage_exit()
        else:
            print("Unknown command line argument: " + arg, file=sys.stderr)
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, alph)

    for s in seqs:
        seq = s.getString()
        name = s.getName()
        for copy_index in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if copies == 1:
                print(">%s\n%s" % (name + tag, shuffledSeq), file=sys.stdout)
            else:
                # several copies: disambiguate the names with the copy index
                print(">%s_%d\n%s" % (name + tag, copy_index, shuffledSeq),
                      file=sys.stdout)
def main():
    """Run the dinucleotide-shuffle command-line tool.

    Parses ``sys.argv`` for the FASTA file name (``-f``, required), a name
    tag (``-t``), the random seed (``-s``), the number of copies (``-c``)
    and an optional alphabet file (``-a``).  The random generator is
    seeded, every sequence is read with ``sequence.readFASTA`` and, for
    each one, ``copies`` shuffled versions produced by ``dinuclShuffle``
    are written to stdout in FASTA format.  When more than one copy is
    requested the copy index is appended to the sequence name.

    The usage message is printed to stderr and the process exits with
    status 1 when no arguments are given, when ``-h`` is seen, when an
    option is missing its value, when ``-s`` / ``-c`` is not an integer,
    or when ``-f`` is absent.  An unknown option prints an error naming it
    and also exits with status 1.
    """
    #
    # defaults
    #
    file_name = None
    alphabet_file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -a <filename>   alphabet file to use non-DNA alphabets
    -h              print this usage message

    Note that fasta-shuffle-letters also supports dinucleotide shuffling and is faster.
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Print the usage text to stderr and stop with a failure status.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-f":
                file_name = value
            elif arg == "-t":
                tag = value
            elif arg == "-a":
                alphabet_file_name = value
            else:
                # -s and -c take integers; anything else is a usage error
                try:
                    number = int(value)
                except ValueError:
                    usage_exit()
                if arg == "-s":
                    seed = number
                else:
                    copies = number
        elif arg == "-h":
            usage_exit()
        else:
            print("Unknown command line argument: " + arg, file=sys.stderr)
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    # get the alphabet, defaulting to DNA if it is not provided
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name, alph)

    for s in seqs:
        seq = s.getString()
        name = s.getName()
        for copy_index in range(copies):
            shuffledSeq = dinuclShuffle(seq, alph)
            if copies == 1:
                print(">%s\n%s" % (name + tag, shuffledSeq), file=sys.stdout)
            else:
                # several copies: disambiguate the names with the copy index
                print(">%s_%d\n%s" % (name + tag, copy_index, shuffledSeq),
                      file=sys.stdout)
def main():
    """Run the Hamming-enrichment command-line tool.

    Parses ``sys.argv`` for the word (``-w``), the positive and negative
    FASTA file names (``-p`` / ``-n``), an optional alphabet file (``-a``)
    and the refine switch (``-r``).  It then reads both sequence sets,
    optionally refines the word with ``refine_consensus``, computes the
    best Hamming alignment with ``get_best_hamming_alignment`` and prints
    the result as a MEME header and motif.  Progress messages and the
    elapsed time go to stderr; the elapsed time is also written to stdout
    as a ``#`` comment line.

    The usage message is printed to stderr and the process exits with
    status 1 when no arguments are given, when ``-h`` or an unknown option
    is seen, when an option is missing its value, or when any of the
    required ``-w`` / ``-p`` / ``-n`` options is absent.
    """
    pos_seq_file_name = None  # no positive sequence file specified
    neg_seq_file_name = None  # no negative sequence file specified
    alphabet_file_name = None
    # Initialised so that omitting -w is reported as a usage error below
    # rather than failing later with an unbound variable.
    word = None
    refine = False
    given_only = False  # no command-line option changes this; passed through

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -w <word>               word (required)
    -p <file_name>          positive sequences FASTA file name (required)
    -n <file_name>          negative sequences FASTA file name (required)
    -a <file_name>          alphabet definition file
    -r                      refine consensus by branching search
                            (distance 1 steps; beam size = 1).
    -h                      print this usage message

    Compute the Hamming distance from <word> to each FASTA sequence
    in the positive and negative files.  Apply Fisher's Exact test to
    each distance.
    <word> may contain ambiguous characters.

    """ % (sys.argv[0])

    def usage_exit():
        # Every command-line problem (and -h) prints the usage and exits 1.
        print(usage, file=sys.stderr)
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-w", "-p", "-n", "-a"):
            # these options consume the following argument as their value
            i += 1
            if i >= len(sys.argv):
                usage_exit()
            value = sys.argv[i]
            if arg == "-w":
                word = value
            elif arg == "-p":
                pos_seq_file_name = value
            elif arg == "-n":
                neg_seq_file_name = value
            else:
                alphabet_file_name = value
        elif arg == "-r":
            refine = True
        else:
            # -h and unrecognised options are handled identically
            usage_exit()
        i += 1

    # check that required arguments given
    if (word is None or pos_seq_file_name is None
            or neg_seq_file_name is None):
        usage_exit()

    # keep track of time
    start_time = time.time()

    # read alphabet, defaulting to DNA when no alphabet file is given
    if alphabet_file_name is not None:
        alph = alphabet.loadFromFile(alphabet_file_name)
    else:
        alph = alphabet.dna()

    # read sequences
    print("Reading sequences...", file=sys.stderr)
    pos_seqs = get_strings_from_seqs(sequence.readFASTA(pos_seq_file_name, alph))
    neg_seqs = get_strings_from_seqs(sequence.readFASTA(neg_seq_file_name, alph))

    if refine:
        # only the refined word is needed; its log p-value is recomputed below
        best_word, _ = refine_consensus(word, pos_seqs, neg_seqs, alph, given_only)
    else:
        best_word = word

    print("Computing Hamming alignment...", file=sys.stderr)
    (dist, log_pvalue, p, P, n, N, aln) = get_best_hamming_alignment(
        best_word, pos_seqs, neg_seqs, alph, given_only)
    pv_string = sprint_logx(log_pvalue, 1, _pv_format)
    nsites = len(aln)
    print("[", p, P, n, N, dist, "]", file=sys.stderr)
    print("Best ZOOPs alignment has %d sites / %d at distance %d with p-value %s" % (nsites, P, dist, pv_string), file=sys.stderr)
    print_meme_header(alph)
    print_meme_motif(best_word, nsites, pv_string, aln, alph)

    # print elapsed time
    end_time = time.time()
    elapsed = end_time - start_time
    print("elapsed time: %.2f seconds" % elapsed, file=sys.stderr)
    print("#elapsed time: %.2f seconds" % elapsed, file=sys.stdout)