def test_can_find_chrom_using_legacy_argument_name(self):
    """The legacy ``chr_col`` keyword is accepted as the chromosome column."""
    legacy_options = {
        'chr_col': 1,
        'pos_col': 2,
        'ref_col': 3,
        'alt_col': 4,
        'pvalue_col': 5,
        'is_neg_log_pvalue': True,
        'delimiter': '\t',
    }
    parse_line = parsers.GenericGwasLineParser(**legacy_options)
    parsed = parse_line('1\t100\tA\tC\t1')
    assert parsed.chrom == '1'
def standard_gwas_parser_basic():
    """Return a tab-delimited parser for chrom/pos/ref/alt/pvalue in columns 1-5.

    The parser is configured with ``is_neg_log_pvalue=True``.
    """
    column_layout = {
        'chrom_col': 1,
        'pos_col': 2,
        'ref_col': 3,
        'alt_col': 4,
        'pvalue_col': 5,
    }
    return parsers.GenericGwasLineParser(
        is_neg_log_pvalue=True,
        delimiter='\t',
        **column_layout
    )
def test_can_convert_to_neglogpvalue(self):
    """A value flagged as -log10(p) is kept as is and also exposed as a p-value.

    The input column holds ``1`` and the parser is built with
    ``is_neg_log_pvalue=True``; the parsed record should report
    ``neg_log_pvalue`` of 1 and ``pvalue`` of 0.1.
    """
    line = '1\t100\tA\tC\t1'
    special_parser = parsers.GenericGwasLineParser(
        chrom_col=1,
        pos_col=2,
        ref_col=3,
        alt_col=4,
        pvalue_col=5,
        is_neg_log_pvalue=True,
        delimiter='\t',
    )
    p = special_parser(line)
    # The failure message previously duplicated the one below, which made a
    # failing run point at the wrong behavior.
    assert p.neg_log_pvalue == pytest.approx(1), 'Parses -logp as is'
    assert p.pvalue == pytest.approx(0.1), 'Converts -log to pvalue'
def test_can_convert_to_logpvalue_using_legacy_argument_names(self):
    """The legacy ``pval_col`` / ``is_log_pval`` keywords configure -log p-value parsing."""
    legacy_parser = parsers.GenericGwasLineParser(
        chrom_col=1,
        pos_col=2,
        ref_col=3,
        alt_col=4,
        pval_col=5,
        is_log_pval=True,
        delimiter='\t',
    )
    parsed = legacy_parser('1\t100\tA\tC\t1')
    assert parsed.neg_log_pvalue == pytest.approx(1), 'Parses -logp as is'
    assert parsed.pvalue == pytest.approx(0.1), 'Converts -log to pvalue'
def test_gets_marker_info_from_hybrid_fields(self):
    """Chrom and pos come from the marker; ref and alt come from their own columns."""
    hybrid_parser = parsers.GenericGwasLineParser(
        marker_col=1,
        ref_col=2,
        alt_col=3,
        pval_col=4,
    )
    # The marker carries NA placeholders for its allele parts.
    parsed = hybrid_parser('chr2:100_NA_NA\tA\tC\t.05')
    assert parsed.chrom == '2', 'Read chrom from marker'
    assert parsed.pos == 100, 'Read pos from marker'
    assert parsed.ref == 'A', 'Read ref from column and ignored marker value'
    assert parsed.alt == 'C', 'Read alt from column and ignored marker value'
def test_warns_about_incorrect_delimiter(self):
    """
    Regression test: files edited by hand can mix tabs and spaces, which is
    very hard to notice by eye. Parsing a tab-separated line with a space
    delimiter must raise an error that mentions the delimiter.
    """
    tab_separated_line = 'chr2:100:A:C_anno\t.05'
    space_parser = parsers.GenericGwasLineParser(
        marker_col=1,
        pvalue_col=2,
        delimiter=' ',
    )
    expected_error = exceptions.LineParseException
    with pytest.raises(expected_error, match="delimiter"):
        space_parser(tab_separated_line)
def test_parses_marker_to_clean_format(self):
    """A marker with a prefix and trailing annotation is split and re-rendered cleanly."""
    marker_parser = parsers.GenericGwasLineParser(
        marker_col=1,
        pvalue_col=2,
        delimiter='\t',
    )
    variant = marker_parser('chr2:100:A:C_anno\t.05')
    assert variant.chrom == '2', 'Finds chromosome'
    assert variant.pos == 100, 'Finds position'
    assert variant.ref == 'A', 'Finds ref'
    assert variant.alt == 'C', 'Finds alt'
    assert variant.marker == '2:100_A/C', 'Turns a messy marker into a cleaned standardized format'
def test_parses_rsid_to_clean_format(self):
    """The rsid column is normalized: kept, replaced with None, or given an ``rs`` prefix."""
    rsid_parser = parsers.GenericGwasLineParser(
        chrom_col=1,
        pos_col=2,
        pvalue_col=3,
        rsid_col=4,
        delimiter='\t',
    )
    cases = [
        # Handles valid rsid as given
        ('chrx\t100\t.05\trs12', 'rs12'),
        # Missing values
        ('chrx\t100\t.05\tNA', None),
        # Ensures prefix is present
        ('chrx\t100\t.05\t99', 'rs99'),
    ]
    for raw_line, wanted in cases:
        found = rsid_parser(raw_line).rsid
        assert found == wanted, 'Found correct rsid from: {}'.format(raw_line)
def standard_gwas_parser():
    """Return a tab-delimited parser covering variant, p-value, effect and frequency columns.

    Columns 1-5 hold chrom/pos/ref/alt/pvalue (with ``is_neg_log_pvalue=True``);
    columns 6-8 hold beta, stderr of beta, and allele frequency
    (with ``is_alt_effect=True``).
    """
    variant_columns = {
        'chrom_col': 1,
        'pos_col': 2,
        'ref_col': 3,
        'alt_col': 4,
    }
    stat_columns = {
        'pvalue_col': 5,
        'beta_col': 6,
        'stderr_beta_col': 7,
        'allele_freq_col': 8,
    }
    return parsers.GenericGwasLineParser(
        is_neg_log_pvalue=True,
        is_alt_effect=True,
        delimiter='\t',
        **variant_columns,
        **stat_columns
    )
def main(source: ty.Union[str, ty.Iterable],
         out_fn: ty.Union[str, None],
         parser_options: dict,
         auto_config=False,
         skip_rows=None,
         skip_errors=True,
         max_errors=100,
         make_tabix: bool = False):
    """Read ``source`` with a (possibly guessed) GWAS parser and write the result.

    :param source: Input handed to ``sniffers.guess_gwas_generic``; when it is
        None, ``sys.stdin`` is used instead.
    :param out_fn: Destination passed to ``reader.write``. If ``write`` returns
        a falsy value, the destination is reported as 'console'.
    :param parser_options: Keyword arguments for ``parsers.GenericGwasLineParser``.
        If they raise ``ConfigurationException``, no parser is built up front.
    :param auto_config: When False, both ``skip_rows`` and a valid parser are
        required; otherwise the process exits with status 1.
    :param skip_rows: Forwarded to ``sniffers.guess_gwas_generic``.
    :param skip_errors: Forwarded to ``sniffers.guess_gwas_generic``.
    :param max_errors: Forwarded to ``sniffers.guess_gwas_generic``.
    :param make_tabix: Forwarded to ``reader.write``.

    Failures while writing are logged, not re-raised. Every entry in
    ``reader.errors`` is logged afterwards, whether or not the write succeeded.
    """
    try:
        parser = parsers.GenericGwasLineParser(**parser_options)
    except exceptions.ConfigurationException:
        # Incomplete options are not fatal yet; the check below decides.
        parser = None

    if source is None:
        source = sys.stdin

    fully_specified = skip_rows is not None and parser is not None
    if not (auto_config or fully_specified):
        logger.error(
            'Please provide all options required to parse the file, or use the --auto flag to guess'
        )
        sys.exit(1)

    # Guess how to read the file. If no parser was provided, try to guess columns.
    sniff_options = {
        'skip_rows': skip_rows,
        'parser': parser,
        'parser_options': parser_options,
        'skip_errors': skip_errors,
        'max_errors': max_errors,
    }
    reader = sniffers.guess_gwas_generic(source, **sniff_options)

    try:
        written_to = reader.write(out_fn, make_tabix=make_tabix)
        destination = written_to or 'console'
    except exceptions.TooManyBadLinesException:
        logger.error('ERROR: Too many lines failed to parse; stopping.')
    except Exception:
        logger.exception('Conversion failed due to unknown error')
    else:
        logger.info(
            'Conversion succeeded! Results written to: {}'.format(destination))
    finally:
        # Report every rejected row regardless of how the write ended.
        for row_number, why, _ in reader.errors:
            logger.error(
                'Excluded row {} from output due to parse error: {}'.format(
                    row_number, why))
def test_validates_frequency_fields(self):
    """Conflicting or incomplete allele frequency options are rejected at construction."""
    config_error = exceptions.ConfigurationException

    # Count and frequency columns cannot both be given.
    with pytest.raises(config_error, match='mutually exclusive'):
        parsers.GenericGwasLineParser(
            marker_col=1,
            pvalue_col=2,
            allele_count_col=3,
            allele_freq_col=4,
        )

    # A count column without a sample-size column is rejected.
    with pytest.raises(config_error, match='n_samples'):
        parsers.GenericGwasLineParser(
            marker_col=1,
            pvalue_col=2,
            allele_count_col=3,
            n_samples_col=None,
        )
def test_validates_that_allele_spec_is_none_or_both(self):
    """Giving ``ref_col`` without ``alt_col`` is expected to be a configuration error.

    NOTE(review): ``pvalue_col=None`` is passed as well, so the 'all required'
    error may be triggered by the missing p-value column rather than by the
    unpaired allele column. Confirm which validation this test exercises.
    """
    incomplete_options = {'marker_col': 1, 'ref_col': 3, 'pvalue_col': None}
    with pytest.raises(exceptions.ConfigurationException, match='all required'):
        parsers.GenericGwasLineParser(**incomplete_options)
def test_validates_arguments_required_fields(self):
    """A configuration with ``pvalue_col=None`` is rejected with an 'all required' error."""
    missing_pvalue = {'marker_col': 1, 'pvalue_col': None}
    with pytest.raises(exceptions.ConfigurationException, match='all required'):
        parsers.GenericGwasLineParser(**missing_pvalue)
def test_parses_freq_from_freq(self):
    """With ``is_alt_effect=True``, a frequency column is reported unchanged as the alt frequency."""
    freq_parser = parsers.GenericGwasLineParser(
        marker_col=1,
        pvalue_col=2,
        allele_freq_col=3,
        is_alt_effect=True,
    )
    record = freq_parser('chr2:100:A:C_anno\t.05\t0.25')
    assert record.alt_allele_freq == 0.25, "Parses frequency as is"
def test_parses_freq_from_counts(self):
    """A count of 25 with 100 samples and ``is_alt_effect=False`` gives an alt frequency of 0.875."""
    count_parser = parsers.GenericGwasLineParser(
        marker_col=1,
        pvalue_col=2,
        allele_count_col=3,
        n_samples_col=4,
        is_alt_effect=False,
    )
    record = count_parser('chr2:100:A:C_anno\t.05\t25\t100')
    assert record.alt_allele_freq == 0.875, "Calculates frequency from counts and orients to alt allele"
def test_parses_chr_to_clean_format(self):
    """A chromosome given as ``chrx`` is reported as ``X``."""
    chrom_parser = parsers.GenericGwasLineParser(
        chrom_col=1,
        pos_col=2,
        pvalue_col=3,
        delimiter='\t',
    )
    record = chrom_parser('chrx\t100\t.05')
    assert record.chrom == 'X', 'Strips prefix from chromosome labels and always uses uppercase letters'