Пример #1
0
            all_threads[i].start()
        while len(mp.active_children()) > 1:
            time.sleep(1)
        queue.put("FINISHED")
        while len(mp.active_children()) > 0:
            time.sleep(1)
    else:
        queue = open(output_filename, 'w')
        for chrom in sorted(list(input.keys())):
            generate_bedgraph_lines(input[chrom], chrom, queue, parallel=False)
        queue.close()


# 'chromosomes' contains the lengths of all chromosomes that the BEDGRAPH contains values for.

genome = fu.import_genome(args.fasta)
# Map every sequence ID in the genome to the length of its sequence.
chromosomes = {ID: len(transcript) for ID, transcript in genome.items()}

# 'coverage' is a nested dictionary of float vectors for each nucleotide in the genome.
# Contains a dictionary for each BEDGRAPH file with values at each position.
coverage = {}
# NOTE(review): 'graph' is opened without a context manager; make sure the
# code that reads it closes the handle when it is done.
graph = open(args.input)

for line in graph:
    chrom, start, end, count = line.rstrip().split()
    count = float(count)
    if chrom not in coverage:
        coverage[chrom] = {}
Пример #2
0
                    choices=['none', '5p', '3p', 'both'])
# --nucfreqs is a plain on/off switch: absent -> False, present -> True.
parser.add_argument(
    "--nucfreqs",
    action="store_true",
    default=False,
    dest='NUCFREQS',
    help="Saves a table of nucleotide frequencies in selected genome region.")

# Parse the command line into the 'args' namespace read by the setup code below.
args = parser.parse_args()

######################################
# ENVIRONMENT SETUP: DATA STRUCTURES #
######################################

# Load the reference genome when a FASTA path was supplied. Without one, any
# SOFTCLIP_TYPE other than 'none' cannot be honoured, so report it and exit.
if args.FASTA:
    genome = fu.import_genome(args.FASTA)
elif args.SOFTCLIP_TYPE != 'none':
    print(
        'ERROR: Untemplated nucleotide analysis requires a reference genome. Use the --fasta argument.'
    )
    sys.exit(1)

chromosomes = {}

file = open(args.SAMFILES[0])
line = file.readline()
while line[0] == '@':
    print(line.rstrip())
    l = line.rstrip().split('\t')
    if l[0] == '@SQ':
                    default=None,
                    help='filepath to output G content table')

args = parser.parse_args()


def which(x, value=True):
    """Return the indices of the elements of x that compare equal to value."""
    return [index for index, element in enumerate(x) if element == value]


# 'chromosomes' contains the lengths of all chromosomes that the BEDGRAPH contains values for.
# Expects a two-column tab-separated file with:
#    chromosome  length
# Provided with the 'lengths' argument.
if args.genome:
    genome = fu.import_genome(args.genome)

chromosomes = {}
# Use a context manager so the lengths file is closed once it has been read.
with open(args.lengths) as lengths_file:
    for line in lengths_file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file), which would otherwise fail the two-column unpacking below.
        if not line.strip():
            continue
        chrom, length = line.rstrip().split('\t')
        chromosomes[chrom] = int(length)

#'coverage' is a nested dictionary of float vectors for each nucleotide in the genome.
# Contains a dictionary for each BEDGRAPH file with values at each position.
coverage = {}

ingraphs = args.input
for graph in ingraphs:
    print('Importing {}...'.format(graph))
    coverage[graph] = {}
Пример #4
0
    """Returns a list of locations in x that satisfy value"""
    return [a for a, b in enumerate(x) if b == value]


def notwhich(x, value=0):
    """Return the indices of the elements of x that do not equal value."""
    return [index for index, element in enumerate(x) if element != value]


def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flat = []
    for sublist in list_of_lists:
        flat.extend(sublist)
    return flat


# A reference genome is optional only when the requested feature is
# 'transcript'; any other feature type without a genome aborts the run.
if args.GENOME:
    genome = fu.import_genome(args.GENOME)
elif args.FEATURE != 'transcript':
    print(
        "ERROR: cannot locate {} features without a reference genome.".format(
            args.FEATURE))
    print("Provide genome FASTA file with -G")
    sys.exit(1)

# Fold every file listed in PLUS_BEDGRAPH into a single 'coverage' object,
# tagging each with strand '+'. While 'coverage' is still None/empty a file is
# parsed into it; afterwards each file is merged with add_bedgraph
# (presumably updating 'coverage' in place -- confirm in the 'bu' module).
coverage = None
if args.PLUS_BEDGRAPH:
    for i in args.PLUS_BEDGRAPH:
        if coverage:
            bu.add_bedgraph(coverage, i, '+')
        else:
            coverage = bu.parse_bedgraph(i, '+')
Пример #5
0
            'exon_nums', []) + [exon_number]

# Sorted IDs (the keys of 'ref_transcripts'), reported together with the
# reference GFF they were read for.
ref_IDs = sorted(ref_transcripts.keys())
print('# {} reference transcripts: {}'.format(
    len(ref_IDs), args.reference_GFF))

# 'picked_IDs' is an array of IDs to use from the reference_GFF
if args.subset:
    # The ID is the first tab-separated column of each line in the subset
    # file. The context manager closes the file once it has been read.
    with open(args.subset) as subset_file:
        picked_IDs = [line.rstrip().split('\t')[0] for line in subset_file]
else:
    # No subset requested: use every reference ID.
    picked_IDs = ref_IDs

# 'genome' is a dict of strings for each chromosome in 'genome_fasta'
genome = fu.import_genome(args.genome_fasta)

# 'chromosomes' maps each chromosome name in 'genome' to its sequence length.
chromosomes = {chrom: len(sequence) for chrom, sequence in genome.items()}

# 'coverage' is a dictionary of float vectors for each nucleotide in the genome.
# Contains the value of the BEDGRAPH file at each position.
coverage = {}
coverage['+'] = {}
coverage['-'] = {}

for chrom, chromlen in chromosomes.items():
    coverage['+'][chrom] = np.zeros(chromlen, dtype='float32')
]
# Values stored in IUPACnot, one per entry of 'keys' (paired by position in
# the loop below).
notkeys = [
    'B', 'D', 'H', 'V', 'V', '-', 'K', 'Y', 'S', 'W', 'R', 'M', 'T', 'G', 'C',
    'A', 'N', '.'
]

# Fill the three lookup tables in lockstep, keyed by the entries of 'keys'.
# zip() stops at the shortest of the four lists, so a length mismatch between
# them silently drops the trailing entries instead of raising.
for k, v, c, n in zip(keys, values, complements, notkeys):
    IUPAChash[k] = v
    IUPACcomp[k] = c
    IUPACnot[k] = n

#################################
# LOADING DATA FROM INPUT FILES #
#################################

# Load the reference sequences (keep_case=False is passed through to the
# loader), then record the length of each sequence under its name.
genome = fu.import_genome(args.genome_FASTA, keep_case=False)
chromosomes = {name: len(sequence) for name, sequence in genome.items()}

linecounter = 0
if __name__ == '__main__':
    for line in open(args.match_file):
        linecounter += 1
        if line[0] == '#':
            continue
        try:
            BED_filename, search_sequence, mismatches = line.rstrip().split(
                ' ')
        except: