Example #1
description = """
This will search NCBI for domains in a protein fasta file.
"""

import sys
import time
import requests

from pythomics.templates import CustomParser

parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_argument(
    "--db", help="The database to search", default="cdd", choices=["cdd", "pfam", "smart", "tigrfam", "cog", "kog"]
)


def main():
    args = parser.parse_args()
    files = {"queries": args.fasta}
    nbci_url = "http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?"
    response = requests.post(
        "{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true".format(
            nbci=nbci_url, database=args.db
        ),
        files=files,
    )
    if response.status_code != 200:
        sys.stderr.write("Error interfacing with NCBI: {}".format(response.text))
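
The snippet is truncated at the error branch. A minimal sketch, not taken from pythomics itself, of how the submission could be wrapped with a simple retry loop; the retry count and wait time are hypothetical choices:

import time
import requests

def post_with_retries(url, files, retries=3, wait=10):
    # Retry the NCBI submission a few times before giving up on transient errors.
    response = None
    for _ in range(retries):
        response = requests.post(url, files=files)
        if response.status_code == 200:
            return response
        time.sleep(wait)
    return response
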
Example #2
description = """
This script will accept a given nucleotide fasta file and output
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

parser = CustomParser(description = description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_fasta()
parser.add_out()
parser.add_argument('--min', help="Minimum ORF length in amino acids.", type=int, default=50)
parser.add_argument('--both-strands', help="Search both strands for ORFs.", action='store_true')
parser.add_argument('--no-met-start', help="Output ORFs starting with amino acids other than MET", action='store_true')
parser.add_argument('--from-met', help="Truncate leading amino acids up to MET", action='store_true')
parser.add_argument('--from-met-keep', help="Truncate leading amino acids up to MET, but keep the untruncated version as well.", action='store_true')


def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
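
To make the numbering scheme from the description concrete, here is a standalone sketch that enumerates ORFs from an already-translated frame. It works on a plain protein string rather than pythomics' FastaIterator, and the Met handling loosely mirrors the --no-met-start flag; names are illustrative, not the script's internals:

def find_orfs(translation, min_len=50, require_met=True):
    # Split the translated frame on stop codons; the 1-based segment index is the
    # ORF number, so "ORF 3" is always the third stop-codon-delimited segment.
    orfs = []
    for index, segment in enumerate(translation.split('*'), start=1):
        if require_met:
            met = segment.find('M')
            if met == -1:
                continue
            segment = segment[met:]
        if len(segment) >= min_len:
            orfs.append((index, segment))
    return orfs

# The third segment keeps its index even though earlier segments are filtered out.
print(find_orfs('MAAA*MSHORT*MTHISONEISLONGENOUGH', min_len=10))
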
Example #3
    default=['Heavy/Light'])
mods = parser.add_argument_group('Modification File')
mods.add_argument(
    '--mods',
    help=
    "The modifications file (the file with sites, peptides). For multiple files, separate by spaces (must be in same order as inference).",
    nargs='+',
    type=argparse.FileType('r'),
    required=True)
mods.add_argument('--site-protein',
                  help="The mod file protein column name",
                  type=str,
                  default='Protein')

parser.add_argument('--no-log2',
                    help='Do not log2 normalize quantification values.',
                    action='store_true')
parser.add_argument(
    '--no-median',
    help=
    'Do not normalize quantification values by the median of the experiment.',
    action='store_true')
parser.add_argument(
    '--wp',
    help=
    "The whole proteome inference file, if it exists. For multiple replicates, separate by spaces.",
    nargs='+',
    type=argparse.FileType('r'))
parser.add_argument('--non-mod-norm',
                    help='Normalize the data by the non-modified peptides.',
                    action='store_true')
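
The --no-log2 and --no-median flags toggle two normalization steps. A minimal standalone sketch of what those steps typically amount to for a single experiment's ratios; the function below is illustrative, not the script's own code:

import math
from statistics import median

def normalize(values, log2=True, median_center=True):
    # values: quantification ratios (e.g. Heavy/Light) for one experiment.
    out = [math.log(v, 2) for v in values] if log2 else list(values)
    if median_center:
        m = median(out)
        # Subtract the median on the log scale, divide by it on the raw scale.
        out = [v - m for v in out] if log2 else [v / m for v in out]
    return out
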
Example #4
import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_enzyme(
    help=
    "Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes.  "
    "The order of enzymes will be the order of digestion if digesting in series."
)
parser.add_argument(
    '--parallel',
    help="Should cleavages be done in parallel (default is serial digestion)?",
    action='store_true')


def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
    coverageMap = {}
    aas = sorted(config.RESIDUE_MASSES.keys())
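
Serial versus parallel digestion is handled by pythomics' digest module; the standalone sketch below only contrasts the two modes, using a naive trypsin-like cleavage regex as a stand-in for real enzyme definitions (the regex and function names are assumptions for illustration):

import re

def cleave(sequence, rule=r'([KR])(?!P)'):
    # Naive cleavage: cut after K/R not followed by P (an illustrative trypsin-like rule).
    return [piece for piece in re.sub(rule, r'\1\n', sequence).split('\n') if piece]

def digest_sequence(sequence, rules, parallel=False):
    if parallel:
        # Parallel: each enzyme digests the intact protein independently.
        peptides = set()
        for rule in rules:
            peptides.update(cleave(sequence, rule))
        return peptides
    # Serial: the products of one enzyme are passed to the next, in order.
    fragments = [sequence]
    for rule in rules:
        fragments = [piece for fragment in fragments for piece in cleave(fragment, rule)]
    return set(fragments)
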
Example #5
description = """
This will search NCBI for domains in a protein fasta file.
"""

import sys
import time
import requests

from pythomics.templates import CustomParser

parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_argument('--db',
                    help='The database to search',
                    default='cdd',
                    choices=['cdd', 'pfam', 'smart', 'tigrfam', 'cog', 'kog'])


def main():
    args = parser.parse_args()
    files = {'queries': args.fasta}
    nbci_url = 'http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?'
    response = requests.post(
        '{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true'
        .format(nbci=nbci_url, database=args.db),
        files=files)
    if response.status_code != 200:
        sys.stderr.write('Error interfacing with NCBI: {}'.format(
            response.text))
        return 1
Example #6
description = """
This script will digest a given fasta file with the specified enzymes. 
Both protein and nucleotide fasta files are valid inputs, and when
digesting fasta files, it is possible to create 6 frame as well as 
3 frame translations.
"""

import argparse, sys, itertools
from pythomics.templates import CustomParser
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

parser = CustomParser(description = description)
parser.add_fasta()
parser.add_argument('-t', '--type', help="The type of fasta file (default protein).", choices=['prot','nt'], type=str, default='prot')
parser.add_argument('--frame', help="If using a nucleotide file, translate in how many frames?", choices=[1,3,6], type=int)
parser.add_argument('--genome', help="Are we translating a genome? This will keep chromosome positions in the header.", action='store_true', default=False)
parser.add_out()
parser.add_enzyme()
parser.add_argument('--unique', help="Only return unique peptides per cleavage", action='store_true', default=False)

def main():
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
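
For the 3- versus 6-frame behavior, the script relies on pythomics' translation code; the sketch below only shows how the six frames are typically derived, with the reverse complement supplying the negative strand (helper names are illustrative, not part of pythomics):

def reverse_complement(sequence):
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement.get(base, 'N') for base in reversed(sequence.upper()))

def frames(sequence, n_frames=3):
    # Yield (strand, offset, subsequence); 6-frame adds the reverse complement.
    strands = [('+', sequence)]
    if n_frames == 6:
        strands.append(('-', reverse_complement(sequence)))
    for strand, strand_seq in strands:
        for offset in range(3):
            yield strand, offset, strand_seq[offset:]
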
Example #7
try:
    import re2 as re
except ImportError:
    import re
from multiprocessing import Pool, Value
from collections import Counter
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_out(help="The name of the file you wish to create with results appended.")
parser.add_argument('--peptide-out', nargs='?', help="The file to write digested products to.", type=argparse.FileType('w'), default=os.devnull)
parser.add_argument('--protein-out', nargs='?', help="The file to write grouped products to.", type=argparse.FileType('w'), default=os.devnull)
parser.add_argument('--strict', help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true')
parser.add_delimited_file(cols=['--peptide-col'], col_default='Peptide')
parser.add_argument('-r', '--regex', help="A perl regular expression determining which parts of the header to capture.", type=str)
parser.add_argument('--inferred-name', help="The name you want to assign for protein inference (in case you are regexing for gene names or something).", type=str, default='Proteins')
parser.add_argument('--no-inference', help="Do not append proteins inferred from sequences.", action='store_true')
parser.add_argument('--no-equality', help="Do not consider Leucine and Isoleucine equal for peptide mapping.", action='store_true')
ibaq_group = parser.add_argument_group('iBAQ related options')
ibaq_group.add_argument('--ibaq', help="Provide to append iBAQ values as well (requires protein inference).", action='store_true')
ibaq_group.add_argument('--precursors', help="The column with precursor area (defaults to header lines containing 'Precursor').", type=str)
parser.add_column_function('', col_argument='--ibaq-function', group=ibaq_group, col_help="The function to apply to groups of iBAQ values (for multiple peptide matches).", parent=False)
ibaq_group.add_argument('--non-redundant', help="Use only non-redundant theoretical tryptic peptides for the iBAQ denominator.", action='store_true')
parser.add_enzyme(help="The enzyme used to digest the sample.")
ibaq_group.add_argument('--normalize', help="Normalize iBAQ to total intensity of column (useful for comparing multiple samples).", action='store_true')
protein_group = parser.add_argument_group('Protein Grouping Options')
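
The --no-equality flag controls whether Leucine and Isoleucine are treated as interchangeable when mapping peptides to proteins. A minimal sketch of that idea, independent of the script's own matching code (names are illustrative):

def match_peptide(peptide, proteins, equate_il=True):
    # proteins: dict of {header: sequence}. With I/L equality, both query and
    # targets are collapsed onto one residue before substring matching.
    query = peptide.upper().replace('I', 'L') if equate_il else peptide.upper()
    matches = []
    for header, sequence in proteins.items():
        target = sequence.upper().replace('I', 'L') if equate_il else sequence.upper()
        if query in target:
            matches.append(header)
    return matches
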
Example #8
and summarize how much of the proteome is covered, what residues are missed,
and what isoforms can be uniquely identified.
"""

import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

parser = CustomParser(description = description)
parser.add_fasta()
parser.add_out()
parser.add_enzyme(help="Enzyme to use. Pass a comma separated list (no spaces); "
                    "the order of enzymes will be the order of digestion if digesting in series.")
parser.add_argument('--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true', default=False)
parser.add_argument('--series', help="Should cleavages be done in series? (default)", action='store_true', default=True)


def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme.split(',')
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
    coverageMap = {}
    aas = sorted(config.RESIDUE_MASSES.keys())
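
The description mentions summarizing how much of the proteome is covered and which residues are missed. A standalone sketch of that bookkeeping, assuming peptide match positions are already known (the interval representation is an assumption):

def coverage_summary(protein_length, matches):
    # matches: iterable of (start, end) residue positions, 0-based and end-exclusive.
    covered = set()
    for start, end in matches:
        covered.update(range(start, end))
    missed = protein_length - len(covered)
    return len(covered) / float(protein_length), missed
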
Example #9
and summarize how much of the proteome is covered, what residues are missed,
and what isoforms can be uniquely identified.
"""

import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

parser = CustomParser(description = description)
parser.add_fasta()
parser.add_out()
parser.add_enzyme(help="Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes.  "
                    "The order of enzymes will be the order of digestion if digesting in series.")
parser.add_argument('--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true')


def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
    coverageMap = {}
    aas = sorted(config.RESIDUE_MASSES.keys())
    tlen = 0
Example #10
description = """
This script will annotate a tab delimited text file with peptides with
corresponding proteins present in an annotation file, and can also
use this annotation to include iBAQ measures.
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_argument('--peptide_out', nargs='?', help="The file to write digested products to.", type=argparse.FileType('w'), default=sys.stdout)
parser.add_argument('--protein_out', nargs='?', help="The file to write grouped products to.", type=argparse.FileType('w'), default=sys.stdout)
parser.add_delimited_file()
parser.add_argument('-r', '--regex', help="A perl regular expression determining which parts of the header to capture.", type=str)
parser.add_argument('--no-inference', help="Do not append proteins inferred from sequences.", action='store_false', default=False)
group = parser.add_argument_group('iBAQ related options')
group.add_argument('--ibaq', help="Provide to append iBAQ values as well (requires protein inference).", action='store_true', default=False)
group.add_argument('--precursors', help="The column with precursor area (defaults to header lines containing 'Precursor').", type=int, default=None)
parser.add_enzyme()
group.add_argument('--no-normalize', help="Don't normalize iBAQ to total intensity", action='store_false', default=True)
group.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true', default=False)
protein_group = parser.add_argument_group('Protein Grouping Options')
protein_group.add_argument('--unique-only', help="Only group proteins with unique peptides", action='store_true', default=False)
protein_group.add_argument('--position', help="Write the position of the peptide matches.", action='store_true', default=False)

def main():
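
For the --ibaq option, the usual definition is the summed precursor intensity divided by the number of theoretically observable peptides for that protein. A minimal sketch under that definition; the 6-30 residue window is an assumption, not the script's setting:

def ibaq(precursor_intensities, theoretical_peptides, min_len=6, max_len=30):
    # theoretical_peptides: in-silico digestion products for one protein.
    observable = [p for p in theoretical_peptides if min_len <= len(p) <= max_len]
    if not observable:
        return 0.0
    return sum(precursor_intensities) / float(len(observable))
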
Example #11
description = """
This script will accept a given nucleotide fasta file and output
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

parser = CustomParser(description = description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_fasta()
parser.add_out()
parser.add_argument('--min', help="Minimum ORF length in amino acids.", type=int, default=50)
parser.add_argument('--both-strands', help="Search both strands for ORFs.", action='store_true', default=False)

def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    with args.out as o:
        for header, sequence in fasta_file:
            for i in xrange(3):
                strand='+'
                translation = fasta._translate(sequence[i:])
                translation = translation.split('*')
                for protein_index,protein_sequence in enumerate(translation):
Example #12
__author__ = 'chris'

description = """
This script will lookup features from one delimited file in another delimited file, and
perform various operations on the found entries in the alternative file
"""

import sys, csv
from pythomics.templates import CustomParser
from pythomics.utils import ColumnFunctions

parser = CustomParser(description = description)
parser.add_delimited_file(files=['-a'], delimiter=['--adelim'], cols=['--acol'], header=['--aheader'], help="This is the file to lookup values from.")
parser.add_delimited_file(files=['-b'], delimiter=['--bdelim'], cols=['--bcol'], header=['--bheader'], help="This is the file to lookup values in.")
parser.add_argument('--blookup', help='The column to take entries from in file b.', type=str, default=1)
parser.add_argument('--strict', help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true')
parser.add_out()
parser.add_argument('--function', help='The function to apply to found entries.', choices=['concat', 'mean', 'sum', 'median', 'var', 'std', 'count'], type=str, default='concat')
parser.add_argument('--colname', help='The column name to give the new appended value. Defaults to function chosen', type=str, default='')
parser.add_argument('--aregex', help='An optional regex pattern for matching columns in file a.', type=str, default='')
parser.add_argument('--bregex', help='An optional regex pattern for matching columns in file b.', type=str, default='')

def main():
    args = parser.parse_args()
    a_colname, b_colname, bl_colname = False, False, False
    try:
        a_column = int(args.acol)
        a_column = a_column-1 if a_column > 0 else a_column
    except ValueError:
        a_colname = True
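
The --function choices (concat, mean, sum, median, var, std, count) are applied to the entries found in file b for each key from file a. A standalone sketch of that lookup-and-aggregate step, assuming values have already been converted to floats for the numeric functions; this mirrors, but is not, the ColumnFunctions helper:

from collections import defaultdict
from statistics import mean, median, pstdev, pvariance

FUNCTIONS = {
    'concat': lambda values: ';'.join(str(v) for v in values),
    'mean': mean,
    'sum': sum,
    'median': median,
    'var': pvariance,
    'std': pstdev,
    'count': len,
}

def lookup(a_keys, b_rows, b_key_col, b_value_col, function='concat'):
    # Index file b by its lookup column, then summarize the values found per key.
    index = defaultdict(list)
    for row in b_rows:
        index[row[b_key_col]].append(row[b_value_col])
    summarize = FUNCTIONS[function]
    return {key: summarize(index[key]) for key in a_keys if key in index}
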
Example #13
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

parser = CustomParser(description=description,
                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_fasta()
parser.add_out()
parser.add_argument('--min',
                    help="Minimum ORF length in amino acids.",
                    type=int,
                    default=50)
parser.add_argument('--both-strands',
                    help="Search both strands for ORFs.",
                    action='store_true')
parser.add_argument(
    '--no-met-start',
    help="Output ORFs starting with amino acids other than MET",
    action='store_true')
parser.add_argument('--from-met',
                    help="Truncate leading amino acids up to MET",
                    action='store_true')
parser.add_argument(
    '--from-met-keep',
    help=
    "Truncate leading amino acids up to MET, but keep the untruncated version as well.",
Example #14
This script will digest a given fasta file with the specified enzymes. 
Both protein and nucleotide fasta files are valid inputs, and when
digesting fasta files, it is possible to create 6 frame as well as 
3 frame translations.
"""

import argparse, sys, itertools
from pythomics.templates import CustomParser
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

parser = CustomParser(description=description)
parser.add_fasta()
parser.add_argument('-t',
                    '--type',
                    help="The type of fasta file (default protein).",
                    choices=['prot', 'nt'],
                    type=str,
                    default='prot')
parser.add_argument(
    '--frame',
    help="If using a nucleotide file, translate in how many frames?",
    choices=[1, 3, 6],
    type=int)
parser.add_argument(
    '--genome',
    help=
    "Are we translating a genome? This will keep chromosome positions in the header.",
    action='store_true')
parser.add_out()
parser.add_enzyme()
parser.add_argument('--unique',
Example #15
Also, features can be can be grouped into longer sequences
with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

parser = CustomParser(description = description)
parser.add_delimited_file(cols=['--group-on'])
parser.add_out()
parser.add_argument('--substring', help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)', action='store_true')
parser.add_column_function('--summary-col', col_help="The function to apply to grouped entries in modification columns.")
parser.add_argument('--summary-col-delimiter', help="If the summary column has a delimiter, such as a ; for multiple proteins.")
parser.add_argument('--strict', help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true')
parser.add_argument('--merge', help='Merge together identical entries.', action='store_true')
# parser.add_argument('--merge-columns', help="If set, columns of merged peptides will be combined.", action='store_true')
# parser.add_argument('--merge-delimiter', help='The delimiter for column merges.', type=str, default=';')
parser.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true')

def main():
    args = parser.parse_args()
    peptide_colname = False
    try:
        peptide_column = int(args.group_on)
        peptide_column = peptide_column-1 if peptide_column > 0 else peptide_column
    except ValueError:
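
The --substring behavior described above (ERPEPT collapsing into LNGERPEPTIDE) amounts to dropping peptides that are substrings of a longer retained peptide. A minimal standalone sketch; the case handling only loosely mirrors --case-sensitive:

def collapse_substrings(peptides, case_sensitive=False):
    # Keep only peptides that are not contained in a longer retained peptide.
    normalize = (lambda p: p) if case_sensitive else (lambda p: p.upper())
    ordered = sorted(set(peptides), key=len, reverse=True)
    kept = []
    for peptide in ordered:
        if not any(normalize(peptide) in normalize(longer) for longer in kept):
            kept.append(peptide)
    return kept

# Example: ERPEPT is absorbed into LNGERPEPTIDE.
print(collapse_substrings(['LNGERPEPTIDE', 'ERPEPT']))
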
Example #16
This script will take a delimited file and collapse features together, such
as scan numbers. It can also be used to group peptides into longer sequences
with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

parser = CustomParser(description = description)
parser.add_delimited_file()
parser.add_out()
parser.add_argument('--substring', help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)', action='store_true', default=False)
parser.add_argument('--merge-columns', help="If set, columns of merged peptides will be combined.", action='store_true', default=False)
parser.add_argument('--merge-delimiter', help='The delimiter for column merges.', type=str, default=';')
parser.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true', default=False)

def main():
    args = parser.parse_args()
    peptide_column = args.col-1
    tsv_file = args.tsv
    header_lines = args.header
    delimiter = args.delimiter
    peptide_join = args.substring
    col_delimiter = args.merge_delimiter
    merge_columns = args.merge_columns
    case_sens = args.case_sensitive
    peptide_history = {}
Example #17
from pythomics.templates import CustomParser
from pythomics.utils import ColumnFunctions

parser = CustomParser(description=description)
parser.add_delimited_file(files=['-a'],
                          delimiter=['--adelim'],
                          cols=['--acol'],
                          header=['--aheader'],
                          help="This is the file to lookup values from.")
parser.add_delimited_file(files=['-b'],
                          delimiter=['--bdelim'],
                          cols=['--bcol'],
                          header=['--bheader'],
                          help="This is the file to lookup values in.")
parser.add_argument('--blookup',
                    help='The column to take entries from in file b.',
                    type=str,
                    default=1)
parser.add_argument(
    '--strict',
    help=
    'For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
    action='store_true')
parser.add_out()
parser.add_argument(
    '--function',
    help='The function to apply to found entries.',
    choices=['concat', 'mean', 'sum', 'median', 'var', 'std', 'count'],
    type=str,
    default='concat')
parser.add_argument(
    '--colname',
Example #18
    import re
from multiprocessing import Pool, Value
from collections import Counter
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

parser = CustomParser(description=description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_out(
    help="The name of the file you wish to create with results appended.")
parser.add_argument('--peptide-out',
                    nargs='?',
                    help="The file to write digested products to.",
                    type=argparse.FileType('w'),
                    default=os.devnull)
parser.add_argument('--protein-out',
                    nargs='?',
                    help="The file to write grouped products to.",
                    type=argparse.FileType('w'),
                    default=os.devnull)
parser.add_argument(
    '--strict',
    help=
    'For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
    action='store_true')
parser.add_delimited_file(cols=['--peptide-col'], col_default='Peptide')
parser.add_argument(
    '-r',
Example #19
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

parser = CustomParser(description=description)
parser.add_delimited_file(cols=['--group-on'])
parser.add_out()
parser.add_argument(
    '--substring',
    help=
    'If set, merge features by partial matches (such as collapsing peptides into larger peptides)',
    action='store_true')
parser.add_column_function(
    '--summary-col',
    col_help="The function to apply to grouped entries in modification columns."
)
parser.add_argument(
    '--summary-col-delimiter',
    help=
    "If the summary column has a delimiter, such as a ; for multiple proteins."
)
parser.add_argument(
    '--strict',
    help=
    'For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
Example #20
import argparse

from pythomics.templates import CustomParser

parser = CustomParser(description=description)
group = parser.add_argument_group('Protein Inference File')
group.add_argument('--inference', help="The protein inference file (your peptide file with gene/protein annotations). For multiple files, separate by spaces (must be in same order as mods).", nargs='+', type=argparse.FileType('r'), required=True)
group.add_argument('--gene', help="The Gene column name", type=str, default='Gene')
group.add_argument('--protein', help="The Protein column name", type=str, default='Protein')
group.add_argument('--peptide', help="The Peptide column name", type=str, default='Peptide')
group.add_argument('--quant', help="The name of quantification columns (such as Heavy/Light). Separate multiple columns by spaces", nargs='+', default=['Heavy/Light'])
mods = parser.add_argument_group('Modification File')
mods.add_argument('--mods', help="The modifications file (the file with sites, peptides). For multiple files, separate by spaces (must be in same order as inference).", nargs='+', type=argparse.FileType('r'), required=True)
mods.add_argument('--site-protein', help="The mod file protein column name", type=str, default='Protein')

parser.add_argument('--no-log2', help='Do not log2 normalize quantification values.', action='store_true')
parser.add_argument('--no-median', help='Do not normalize quantification values by the median of the experiment.', action='store_true')
parser.add_argument('--wp', help="The whole proteome inference file, if it exists. For multiple replicates, separate by spaces.", nargs='+', type=argparse.FileType('r'))
parser.add_argument('--non-mod-norm', help='Normalize the data by the non-modified peptides.', action='store_true')

parser.add_argument('--site-file', help='The output path for the file with summaries at the site level.', default=sys.stdout, type=argparse.FileType('wb'))
parser.add_argument('--peptide-file', help='The output path for the file with summaries at the site and peptide level.', default=sys.stdout, type=argparse.FileType('wb'))


def main():
    args = parser.parse_args()
    inference_files = args.inference
    mod_files = args.mods
    wp_files = args.wp if args.wp else []
    quant_cols = args.quant
    gene_col = args.gene
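
Downstream, quantification values are collapsed to the site level. A minimal sketch of that grouping step, assuming each row carries a site identifier and a single quant column; the column names here are hypothetical, not the script's own:

import math
from collections import defaultdict
from statistics import median

def summarize_sites(rows, site_col='Site', quant_col='Heavy/Light', log2=True):
    # Group peptide-level ratios by modification site and report a per-site median.
    by_site = defaultdict(list)
    for row in rows:
        value = float(row[quant_col])
        by_site[row[site_col]].append(math.log(value, 2) if log2 else value)
    return {site: median(values) for site, values in by_site.items()}
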
Example #21
This script will trim N's from the ends of a fasta/fastq file so it can be
aligned by tophat (which pukes if there are >5 N's. We remove them from the read
ends only)
"""

import sys, re, os, gzip
from itertools import izip
from multiprocessing import Pool
from pythomics.templates import CustomParser
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_read_pair()
parser.add_out()
parser.add_argument(
    '--min-len',
    help="The minimum read length reads must be after trimming.",
    type=int,
    default=25)
parser.add_argument('--prefix',
                    help="If using paired reads, this is the filename prefix.",
                    type=str)
parser.add_argument('--quality',
                    help='If provided, remove qualities below a given score.',
                    type=int,
                    default=0)
parser.add_argument('--chunk',
                    help='How many reads to submit to each core.',
                    type=int,
                    default=1000)
parser.add_argument('--no-gzip',
                    help='To disable compression with gzip.',
Example #22
description = """
This script will trim N's from the ends of a fasta/fastq file so it can be
aligned by tophat (which pukes if there are >5 N's. We remove them from the read
ends only)
"""

import sys, re, os, gzip
from itertools import izip
from multiprocessing import Pool
from pythomics.templates import CustomParser
parser = CustomParser(description = description)
parser.add_fasta()
parser.add_read_pair()
parser.add_out()
parser.add_argument('--min-len', help="The minimum read length reads must be after trimming.", type=int, default=25)
parser.add_argument('--prefix', help="If using paired reads, this is the filename prefix.", type=str)
parser.add_argument('--quality', help='If provided, remove qualities below a given score.', type=int, default=0)
parser.add_argument('--chunk', help='How many reads to submit to each core.', type=int, default=1000)
parser.add_argument('--no-gzip', help='To disable compression with gzip.', action='store_false')
# parser.add_argument('--5partial-match', help='This will trim partial matches at the 3\' end of the sequence if there is a match of at least x nucleotides.', type=int, default=0)
# parser.add_argument('--seed-length', help='The seed length for a match.', type=int, default=0)
# parser.add_argument('--mismatches', help='The number of possible mismatches in a sequence.', type=int, default=3)

start_trim = re.compile(r'^N+')
end_trim = re.compile(r'N+$')
global quality_min
global quality_offset
global paired
global read_min
quality_min = 0
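
The start_trim/end_trim patterns above do the actual trimming. A minimal sketch of how a single read and its quality string might be trimmed with them while honoring the --min-len cutoff; the function is illustrative, not the script's worker:

import re

start_trim = re.compile(r'^N+')
end_trim = re.compile(r'N+$')

def trim_read(sequence, quality, min_len=25):
    # Trim leading/trailing N's and apply the same slice to the quality string.
    match = start_trim.match(sequence)
    start = match.end() if match else 0
    match = end_trim.search(sequence)
    end = match.start() if match else len(sequence)
    trimmed_seq, trimmed_qual = sequence[start:end], quality[start:end]
    if len(trimmed_seq) < min_len:
        return None  # read is too short after trimming
    return trimmed_seq, trimmed_qual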