def import_blast(filename):
    """Read a BLAST tabular file and record hits in the global ``blast_pool``.

    Each row is split on whitespace: the first two fields are taken as the
    query and subject names and the last field is parsed as the bit score.
    Both names are passed through ``gene_name`` and then translated through
    the module-level ``tandem_map``.  A row is ignored when either name is
    missing from ``tandem_map`` or when both ends map to the same
    representative.  Blank rows are skipped.

    Progress messages are written to ``sys.stderr``.  Nothing is returned;
    ``blast_pool`` is updated in place.

    :param filename: path of the BLAST output file to read.
    """
    # ``with`` guarantees the handle is released even if a row fails to parse.
    with open(filename) as fp:
        # First pass only counts the rows so the total can be reported.
        total_lines = sum(1 for _ in fp)
        sys.stderr.write(
            "Read BLAST file %s (total %d lines)\n" % (filename, total_lines)
        )
        fp.seek(0)

        for j, row in enumerate(fp, 1):
            if j % 100000 == 0:
                sys.stderr.write("%d read...\n" % j)

            atoms = row.split()
            if not atoms:
                # blank row (e.g. trailing newline) carries no hit
                continue

            a, b, bitscore = atoms[0], atoms[1], float(atoms[-1])
            a, b = gene_name(a), gene_name(b)
            if a not in tandem_map or b not in tandem_map:
                continue

            # collapse each gene onto its tandem-group representative
            a, b = tandem_map[a], tandem_map[b]
            if a == b:
                continue

            # keep the best blast hit
            # NOTE(review): Hit.update is defined elsewhere; presumably it
            # retains the higher-scoring subject -- confirm.
            if a not in blast_pool:
                blast_pool[a] = Hit(b, bitscore)
            else:
                blast_pool[a].update(b, bitscore)
def import_blast(filename):
    """Read a BLAST tabular file and record hits in the global ``blast_pool``.

    Each row is split on whitespace: the first two fields are taken as the
    query and subject names and the last field is parsed as the bit score.
    Both names are passed through ``gene_name`` and then translated through
    the module-level ``tandem_map``.  A row is ignored when either name is
    missing from ``tandem_map`` or when both ends map to the same
    representative.  Blank rows are skipped.

    Progress messages are written to ``sys.stderr``.  Nothing is returned;
    ``blast_pool`` is updated in place.

    :param filename: path of the BLAST output file to read.
    """
    # ``with`` guarantees the handle is released even if a row fails to parse.
    with open(filename) as fp:
        # First pass only counts the rows so the total can be reported.
        total_lines = sum(1 for _ in fp)
        sys.stderr.write(
            "Read BLAST file %s (total %d lines)\n" % (filename, total_lines)
        )
        fp.seek(0)

        for j, row in enumerate(fp, 1):
            if j % 100000 == 0:
                sys.stderr.write("%d read...\n" % j)

            atoms = row.split()
            if not atoms:
                # blank row (e.g. trailing newline) carries no hit
                continue

            a, b, bitscore = atoms[0], atoms[1], float(atoms[-1])
            a, b = gene_name(a), gene_name(b)
            if a not in tandem_map or b not in tandem_map:
                continue

            # collapse each gene onto its tandem-group representative
            a, b = tandem_map[a], tandem_map[b]
            if a == b:
                continue

            # keep the best blast hit
            # NOTE(review): Hit.update is defined elsewhere; presumably it
            # retains the higher-scoring subject -- confirm.
            if a not in blast_pool:
                blast_pool[a] = Hit(b, bitscore)
            else:
                blast_pool[a].update(b, bitscore)
def process_tandems(fp_blast, sizes, ranks, tandem):
    """Group tandem genes from a self-BLAST and pick one representative each.

    The file is rewound and read twice: once to count rows for the progress
    message, once to process them.  For every row the first two
    whitespace-separated fields are normalised with ``gene_name``.  When both
    genes are present in ``ranks``, lie on the same chromosome and their
    ranks differ by at most the module-level ``Tandem_Nmax``, they are merged
    with ``tandem.join``.  Rows with fewer than two fields are skipped.

    Afterwards every group yielded by iterating ``tandem`` is reduced to the
    member with the largest entry in ``sizes``.  If no member of a group has
    a positive size on record, the first member of the group is used, so
    that unrelated groups are never merged under an empty-string name.

    :param fp_blast: seekable, iterable file object holding the self-BLAST.
    :param sizes: mapping gene -> size.
    :param ranks: mapping gene -> (chromosome, rank).
    :param tandem: grouping object providing ``join(a, b)`` and iteration
        over groups of genes.
    :returns: ``(tandem_map, tandem_removed)`` where ``tandem_map`` maps each
        grouped gene to its representative and ``tandem_removed`` is the set
        of representatives.
    """
    fp_blast.seek(0)
    total_lines = sum(1 for _ in fp_blast)
    sys.stderr.write("Read self BLAST file (total %d lines)\n" % total_lines)
    fp_blast.seek(0)

    for j, row in enumerate(fp_blast, 1):
        if j % 100000 == 0:
            sys.stderr.write("%d read...\n" % j)

        atoms = row.split()
        if len(atoms) < 2:
            # blank or truncated row: no gene pair to inspect
            continue

        a, b = gene_name(atoms[0]), gene_name(atoms[1])
        if a not in ranks or b not in ranks:
            continue

        chr_a, rank_a = ranks[a]
        chr_b, rank_b = ranks[b]
        if chr_a == chr_b and abs(rank_a - rank_b) <= Tandem_Nmax:
            tandem.join(a, b)

    tandem_removed = set()  # the filtered gene set
    tandem_map = {}         # unfiltered => filtered

    for tandem_group in tandem:
        members = list(tandem_group)
        if not members:
            continue

        longest_gene, longest_size = None, 0
        for gene in members:
            # strict ">" keeps the first gene seen among equal sizes
            if gene in sizes and sizes[gene] > longest_size:
                longest_gene, longest_size = gene, sizes[gene]

        if longest_gene is None:
            # no usable size for any member: keep a real gene as the
            # representative instead of the empty string
            longest_gene = members[0]

        for gene in members:
            tandem_map[gene] = longest_gene
        tandem_removed.add(longest_gene)

    sys.stderr.write("%d genes after tandem removal\n" % len(tandem_removed))
    return tandem_map, tandem_removed
def process_tandems(fp_blast, sizes, ranks, tandem):
    """Group tandem genes from a self-BLAST and pick one representative each.

    The file is rewound and read twice: once to count rows for the progress
    message, once to process them.  For every row the first two
    whitespace-separated fields are normalised with ``gene_name``.  When both
    genes are present in ``ranks``, lie on the same chromosome and their
    ranks differ by at most the module-level ``Tandem_Nmax``, they are merged
    with ``tandem.join``.  Rows with fewer than two fields are skipped.

    Afterwards every group yielded by iterating ``tandem`` is reduced to the
    member with the largest entry in ``sizes``.  If no member of a group has
    a positive size on record, the first member of the group is used, so
    that unrelated groups are never merged under an empty-string name.

    :param fp_blast: seekable, iterable file object holding the self-BLAST.
    :param sizes: mapping gene -> size.
    :param ranks: mapping gene -> (chromosome, rank).
    :param tandem: grouping object providing ``join(a, b)`` and iteration
        over groups of genes.
    :returns: ``(tandem_map, tandem_removed)`` where ``tandem_map`` maps each
        grouped gene to its representative and ``tandem_removed`` is the set
        of representatives.
    """
    fp_blast.seek(0)
    total_lines = sum(1 for _ in fp_blast)
    sys.stderr.write("Read self BLAST file (total %d lines)\n" % total_lines)
    fp_blast.seek(0)

    for j, row in enumerate(fp_blast, 1):
        if j % 100000 == 0:
            sys.stderr.write("%d read...\n" % j)

        atoms = row.split()
        if len(atoms) < 2:
            # blank or truncated row: no gene pair to inspect
            continue

        a, b = gene_name(atoms[0]), gene_name(atoms[1])
        if a not in ranks or b not in ranks:
            continue

        chr_a, rank_a = ranks[a]
        chr_b, rank_b = ranks[b]
        if chr_a == chr_b and abs(rank_a - rank_b) <= Tandem_Nmax:
            tandem.join(a, b)

    tandem_removed = set()  # the filtered gene set
    tandem_map = {}         # unfiltered => filtered

    for tandem_group in tandem:
        members = list(tandem_group)
        if not members:
            continue

        longest_gene, longest_size = None, 0
        for gene in members:
            # strict ">" keeps the first gene seen among equal sizes
            if gene in sizes and sizes[gene] > longest_size:
                longest_gene, longest_size = gene, sizes[gene]

        if longest_gene is None:
            # no usable size for any member: keep a real gene as the
            # representative instead of the empty string
            longest_gene = members[0]

        for gene in members:
            tandem_map[gene] = longest_gene
        tandem_removed.add(longest_gene)

    sys.stderr.write("%d genes after tandem removal\n" % len(tandem_removed))
    return tandem_map, tandem_removed
def load_sizes(fp_sizes):
    """Load gene sizes (used to keep the longest gene in a tandem group).

    The file is rewound and each non-blank row is expected to hold exactly
    two whitespace-separated fields: a name and an integer size.  The name
    is normalised with ``gene_name``; when several rows normalise to the
    same gene, the largest size is kept.  Blank rows are skipped.

    :param fp_sizes: seekable, iterable file object for the ``.sizes`` file.
    :returns: dict mapping gene -> largest size seen.
    :raises ValueError: if a non-blank row does not have exactly two fields
        or its size is not an integer.
    """
    fp_sizes.seek(0)
    sizes = {}  # gene => size
    sys.stderr.write("Read .sizes file\n")

    for row in fp_sizes:
        fields = row.split()
        if not fields:
            # tolerate blank rows such as a trailing empty line
            continue

        gene, size = fields
        # sizes are calculated for the transcripts, and we keep the longest
        # transcript per gene
        gene, size = gene_name(gene), int(size)
        if gene not in sizes or size > sizes[gene]:
            sizes[gene] = size

    return sizes
def load_sizes(fp_sizes):
    """Load gene sizes (used to keep the longest gene in a tandem group).

    The file is rewound and each non-blank row is expected to hold exactly
    two whitespace-separated fields: a name and an integer size.  The name
    is normalised with ``gene_name``; when several rows normalise to the
    same gene, the largest size is kept.  Blank rows are skipped.

    :param fp_sizes: seekable, iterable file object for the ``.sizes`` file.
    :returns: dict mapping gene -> largest size seen.
    :raises ValueError: if a non-blank row does not have exactly two fields
        or its size is not an integer.
    """
    fp_sizes.seek(0)
    sizes = {}  # gene => size
    sys.stderr.write("Read .sizes file\n")

    for row in fp_sizes:
        fields = row.split()
        if not fields:
            # tolerate blank rows such as a trailing empty line
            continue

        gene, size = fields
        # sizes are calculated for the transcripts, and we keep the longest
        # transcript per gene
        gene, size = gene_name(gene), int(size)
        if gene not in sizes or size > sizes[gene]:
            sizes[gene] = size

    return sizes