def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=True):
    '''Lazily parse an NCBI BLAST+ `-outfmt6` file into DataFrame chunks.

    Native BLAST+ uses an interval of the form [start,end), start >= 1,
    which becomes [end,start) for hits on the negative strand. Unless
    `remap` is switched off, every chunk is handed to `remap_blast` so the
    coordinates come out as proper 0-based, half-open intervals.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator, forwarded to `pd.read_table`.
        chunksize (int): Hits per iteration.
        remap (bool): Whether to run `remap_blast` on each chunk before it
            is yielded.

    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits.
    '''
    # Column names are the first element of each (name, dtype) pair.
    column_names = [name for name, _dtype in blast_cols]
    reader = pd.read_table(
        fn,
        header=None,
        skipinitialspace=True,
        names=column_names,
        delimiter=delimiter,
        chunksize=chunksize,
    )

    for chunk in reader:
        # A fresh name -> dtype mapping is built for every chunk.
        convert_dtypes(chunk, dict(blast_cols))
        if remap:
            remap_blast(chunk)
        yield chunk
def crb_to_df_iter(fn, chunksize=10000, remap=False):
    '''Lazily parse crb-blast (CRBB 0.6.6) results into DataFrame chunks.

    The file is read as tab-delimited text without a header row. The
    `qrange` and `srange` columns, which hold `start..end` strings, are
    split into integer `qstart`/`qend` and `sstart`/`send` columns and then
    dropped.

    Args:
        fn (str): The results file.
        chunksize (int): Hits per iteration.
        remap (bool): Whether to run `remap_blast` on each chunk before it
            is yielded.

    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits.
    '''
    # (packed range column, start column, end column); query first, then
    # subject, so the new columns are appended in the same order as before.
    range_specs = (
        ('qrange', 'qstart', 'qend'),
        ('srange', 'sstart', 'send'),
    )

    reader = pd.read_table(
        fn,
        header=None,
        names=[name for name, _dtype in crb_cols],
        delimiter='\t',
        chunksize=chunksize,
    )

    for chunk in reader:
        convert_dtypes(chunk, dict(crb_cols))

        for range_col, start_col, end_col in range_specs:
            # partition() gives three columns: before, separator, after.
            pieces = chunk[range_col].str.partition('..')
            chunk[start_col] = pieces[0].astype(int)
            chunk[end_col] = pieces[2].astype(int)
            del chunk[range_col]

        if remap:
            remap_blast(chunk)
        yield chunk
def crb_to_df_iter(fn, chunksize=10000, remap=True):
    '''Lazily parse crb-blast (CRBB 0.6.6) results into DataFrame chunks.

    crb-blast is given the same treatment as BLAST+, as that's what it
    uses under the hood: unless `remap` is switched off, every chunk is
    handed to `remap_blast` so the coordinates come out as proper 0-based,
    half-open intervals.

    The `qrange` and `srange` columns, which hold `start..end` strings, are
    split into integer `qstart`/`qend` and `sstart`/`send` columns and then
    dropped.

    Args:
        fn (str): The results file.
        chunksize (int): Hits per iteration.
        remap (bool): Whether to run `remap_blast` on each chunk before it
            is yielded.

    Yields:
        DataFrame: Pandas DataFrame with the CRBB hits.
    '''
    column_names = [name for name, _dtype in crb_cols]
    # Query range is unpacked before the subject range so that the new
    # columns land in the order qstart, qend, sstart, send.
    range_specs = [
        ('qrange', ('qstart', 'qend')),
        ('srange', ('sstart', 'send')),
    ]

    reader = pd.read_table(fn, header=None, names=column_names,
                           delimiter='\t', chunksize=chunksize)
    for chunk in reader:
        convert_dtypes(chunk, dict(crb_cols))

        for packed, (start_col, end_col) in range_specs:
            # partition() yields columns 0 (before), 1 (sep), 2 (after).
            parts = chunk[packed].str.partition('..')
            chunk[start_col] = parts[0].astype(int)
            chunk[end_col] = parts[2].astype(int)
            del chunk[packed]

        if remap:
            remap_blast(chunk)
        yield chunk
def blast_to_df_iter(fn, delimiter='\t', chunksize=10000, remap=False):
    '''Lazily parse an NCBI BLAST+ `-outfmt6` file into DataFrame chunks.

    The file is read without a header row; column names come from
    `blast_cols` and each chunk's columns are passed through
    `convert_dtypes` before the chunk is yielded.

    Args:
        fn (str): The results file.
        delimiter (str): Field separator, forwarded to `pd.read_table`.
        chunksize (int): Hits per iteration.
        remap (bool): Whether to run `remap_blast` on each chunk before it
            is yielded. Off by default.

    Yields:
        DataFrame: Pandas DataFrame with the BLAST+ hits.
    '''
    read_options = dict(
        header=None,
        skipinitialspace=True,
        # Column names are the first element of each (name, dtype) pair.
        names=[name for name, _dtype in blast_cols],
        delimiter=delimiter,
        chunksize=chunksize,
    )

    for chunk in pd.read_table(fn, **read_options):
        # A fresh name -> dtype mapping is built for every chunk.
        convert_dtypes(chunk, dict(blast_cols))
        if remap:
            remap_blast(chunk)
        yield chunk