Exemplo n.º 1
0
 def set_meta(self, dataset, **kwd):
     """Scan the dataset's extra files directory and record snpEff metadata.

     Sets ``genome_version`` from the directory that holds the
     snpEffectPredictor file, and collects the names of any regulation
     and flag-enabled annotation files found alongside it.
     """
     Text.set_meta(self, dataset, **kwd)
     extra_dir = dataset.extra_files_path
     # Files named regulation_<name>.bin carry a regulation track.
     regulation_re = re.compile('regulation_(.+).bin')
     # Annotation files that snpEff enables via a command-line flag.
     flag_by_fname = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
     found_regulations = []
     found_annotations = []
     if extra_dir and os.path.isdir(extra_dir):
         for root, _dirs, files in os.walk(extra_dir):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # Predictor file present => download succeeded; its
                     # parent directory names the genome build.
                     dataset.metadata.genome_version = os.path.basename(root)
                 else:
                     match = regulation_re.match(fname)
                     if match:
                         found_regulations.append(match.group(1))
                     elif fname in flag_by_fname:
                         found_annotations.append(flag_by_fname[fname].lstrip('-'))
         dataset.metadata.regulation = found_regulations
         dataset.metadata.annotation = found_annotations
Exemplo n.º 2
0
 def __init__(self, **kwd):
     """Composite datatype: a group file plus its companion .ti file."""
     Text.__init__(self, **kwd)
     for pattern, desc in (("%s.grp", "Group File"), ("%s.ti", "")):
         self.add_composite_file(
             pattern,
             description=desc,
             substitute_name_with_metadata="reference_name",
             is_binary=False,
         )
Exemplo n.º 3
0
 def __init__(self, **kwd):
     """Composite datatype: bgzipped dbNSFP data plus its tabix index."""
     Text.__init__(self, **kwd)
     for suffix, desc in (('%s.gz', 'dbNSFP bgzip'),
                          ('%s.gz.tbi', 'Tabix Index File')):
         self.add_composite_file(suffix,
                                 description=desc,
                                 substitute_name_with_metadata='reference_name',
                                 is_binary=True)
Exemplo n.º 4
0
 def __init__(self, **kwd):
     """Register the composite .grp and .ti files for this datatype."""
     Text.__init__(self, **kwd)
     composite_specs = [('%s.grp', 'Group File'), ('%s.ti', '')]
     for pattern, description in composite_specs:
         self.add_composite_file(pattern,
                                 description=description,
                                 substitute_name_with_metadata='reference_name',
                                 is_binary=False)
Exemplo n.º 5
0
 def set_meta(self, dataset, **kwd):
     """Scan extra_files_path for snpEff database files and set metadata.

     Records the genome version (parent directory of the
     snpEffectPredictor file), the SnpEff version read from that file,
     and any regulation or flag-enabled annotation files found.  A short
     human-readable summary is then written to the dataset's primary
     file on a best-effort basis.
     """
     Text.set_meta(self, dataset, **kwd)
     data_dir = dataset.extra_files_path
     # search data_dir/genome_version for files
     regulation_pattern = 'regulation_(.+).bin'
     # annotation files that are included in snpEff by a flag
     annotations_dict = {
         'nextProt.bin': '-nextprot',
         'motif.bin': '-motif',
         'interactions.bin': '-interaction'
     }
     regulations = []
     annotations = []
     genome_version = None
     snpeff_version = None
     if data_dir and os.path.isdir(data_dir):
         for root, dirs, files in os.walk(data_dir):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # if snpEffectPredictor.bin download succeeded
                     genome_version = os.path.basename(root)
                     dataset.metadata.genome_version = genome_version
                     # read the first line of the gzipped snpEffectPredictor.bin
                     # file to get the SnpEff version
                     snpeff_version = self.getSnpeffVersionFromFile(
                         os.path.join(root, fname))
                     if snpeff_version:
                         dataset.metadata.snpeff_version = snpeff_version
                 else:
                     m = re.match(regulation_pattern, fname)
                     if m:
                         regulations.append(m.groups()[0])
                     elif fname in annotations_dict:
                         annotations.append(annotations_dict[fname].lstrip('-'))
         dataset.metadata.regulation = regulations
         dataset.metadata.annotation = annotations
         try:
             with open(dataset.file_name, 'w') as fh:
                 # BUG FIX: the conditional previously bound to the whole
                 # '"%s\n" % x' expression, so the fallback strings were
                 # written without a trailing newline.
                 fh.write("%s\n" % (genome_version if genome_version else 'Genome unknown'))
                 fh.write("%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'))
                 if annotations:
                     fh.write("annotations: %s\n" % ','.join(annotations))
                 if regulations:
                     fh.write("regulations: %s\n" % ','.join(regulations))
         except Exception:
             # Best effort only: metadata above is already set even if
             # writing the summary file fails.
             pass
Exemplo n.º 6
0
 def merge(split_files, output_file):
     """Merging multiple MIRA files is non-trivial and may not be possible..."""
     if not split_files:
         raise ValueError("Given no MIRA, %r, to merge into %s"
                          % (split_files, output_file))
     if len(split_files) == 1:
         # A single input can simply be moved/copied by the base class.
         return Text.merge(split_files, output_file)
     raise NotImplementedError("Merging MIRA Assembly Files has not been implemented")
Exemplo n.º 7
0
 def set_meta(self, dataset, **kwd):
     """Scan extra_files_path for snpEff database files and set metadata.

     Records genome_version, snpeff_version, and any regulation or
     flag-enabled annotation files found; writes a short summary into
     the dataset's primary file on a best-effort basis.
     """
     Text.set_meta(self, dataset, **kwd)
     data_dir = dataset.extra_files_path
     # search data_dir/genome_version for files
     regulation_pattern = 'regulation_(.+).bin'
     # annotation files that are included in snpEff by a flag
     annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
     regulations = []
     annotations = []
     genome_version = None
     snpeff_version = None
     if data_dir and os.path.isdir(data_dir):
         for root, dirs, files in os.walk(data_dir):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # if snpEffectPredictor.bin download succeeded
                     genome_version = os.path.basename(root)
                     dataset.metadata.genome_version = genome_version
                     # read the first line of the gzipped snpEffectPredictor.bin
                     # file to get the SnpEff version
                     snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                     if snpeff_version:
                         dataset.metadata.snpeff_version = snpeff_version
                 else:
                     m = re.match(regulation_pattern, fname)
                     if m:
                         name = m.groups()[0]
                         regulations.append(name)
                     elif fname in annotations_dict:
                         value = annotations_dict[fname]
                         name = value.lstrip('-')
                         annotations.append(name)
         dataset.metadata.regulation = regulations
         dataset.metadata.annotation = annotations
         try:
             # BUG FIX: the file() builtin is Python 2 only; use open() in a
             # context manager (closes on error too), catch Exception instead
             # of a bare except, and parenthesize the conditionals so the
             # fallback strings also get their trailing newline.
             with open(dataset.file_name, 'w') as fh:
                 fh.write("%s\n" % (genome_version if genome_version else 'Genome unknown'))
                 fh.write("%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'))
                 if annotations:
                     fh.write("annotations: %s\n" % ','.join(annotations))
                 if regulations:
                     fh.write("regulations: %s\n" % ','.join(regulations))
         except Exception:
             # Best effort only: metadata above is already set.
             pass
Exemplo n.º 8
0
 def merge(split_files, output_file):
     """Merging multiple MIRA files is non-trivial and may not be possible..."""
     if not split_files:
         raise ValueError(
             "Given no MIRA, %r, to merge into %s" % (split_files, output_file))
     if len(split_files) == 1:
         # Single input: defer to the base class (move/copy).
         return Text.merge(split_files, output_file)
     raise NotImplementedError("Merging MIRA Assembly Files has not been implemented")
Exemplo n.º 9
0
 def set_meta(self, dataset, **kwd):
     """Scan the dataset's extra files for snpEff data and set metadata.

     Records genome_version (parent directory of the snpEffectPredictor
     file) plus any regulation/annotation files found, and writes a short
     summary into the dataset's primary file on a best-effort basis.
     """
     Text.set_meta(self, dataset, **kwd)
     data_dir = dataset.extra_files_path
     # search data_dir/genome_version for files
     regulation_pattern = "regulation_(.+).bin"
     #  annotation files that are included in snpEff by a flag
     annotations_dict = {"nextProt.bin": "-nextprot", "motif.bin": "-motif"}
     regulations = []
     annotations = []
     # BUG FIX: initialise genome_version so the summary write below cannot
     # hit a NameError (previously swallowed by the bare except) when no
     # snpEffectPredictor file is present.
     genome_version = None
     if data_dir and os.path.isdir(data_dir):
         for root, dirs, files in os.walk(data_dir):
             for fname in files:
                 if fname.startswith("snpEffectPredictor"):
                     # if snpEffectPredictor.bin download succeeded
                     genome_version = os.path.basename(root)
                     dataset.metadata.genome_version = genome_version
                 else:
                     m = re.match(regulation_pattern, fname)
                     if m:
                         name = m.groups()[0]
                         regulations.append(name)
                     elif fname in annotations_dict:
                         value = annotations_dict[fname]
                         name = value.lstrip("-")
                         annotations.append(name)
         dataset.metadata.regulation = regulations
         dataset.metadata.annotation = annotations
         try:
             # BUG FIX: the file() builtin is Python 2 only; use open() in a
             # context manager and catch Exception, not a bare except.
             with open(dataset.file_name, "w") as fh:
                 fh.write("%s\n" % (genome_version if genome_version else "Genome unknown"))
                 if annotations:
                     fh.write("annotations: %s\n" % ",".join(annotations))
                 if regulations:
                     fh.write("regulations: %s\n" % ",".join(regulations))
         except Exception:
             # Best effort only: metadata above is already set.
             pass
Exemplo n.º 10
0
 def merge(split_files, output_file):
     """
     Merging fps files requires merging the header manually.
     We take the header from the first file.
     """
     if not split_files:
         raise ValueError("No fps files given, %r, to merge into %s" %
                          (split_files, output_file))
     if len(split_files) == 1:
         # For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     with open(output_file, "w") as out:
         header_pending = True
         for filename in split_files:
             with open(filename) as handle:
                 for line in handle:
                     if line.startswith('#'):
                         # Comment lines are copied only while we are still
                         # inside the first file's header block.
                         if header_pending:
                             out.write(line)
                     else:
                         # First data line seen: the header is complete, so
                         # later files' header/comment lines are dropped.
                         header_pending = False
                         out.write(line)
Exemplo n.º 11
0
 def merge(split_files, output_file):
     """
     Merging CML files.

     The two-line XML/CML header is taken from the first file; the
     <molecule> records of every input are copied, and a single closing
     </cml> tag is appended at the end.
     """
     if len(split_files) == 1:
         # For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no CML files, %r, to merge into %s" %
                          (split_files, output_file))
     with open(output_file, "w") as out:
         for filename in split_files:
             with open(filename) as handle:
                 header = handle.readline()
                 if not header:
                     raise ValueError("CML file %s was empty" % filename)
                 if not header.lstrip().startswith('<?xml version="1.0"?>'):
                     out.write(header)
                     raise ValueError("%s is not a valid XML file!" %
                                      filename)
                 line = handle.readline()
                 header += line
                 if not line.lstrip().startswith(
                         '<cml xmlns="http://www.xml-cml.org/schema'):
                     out.write(header)
                     raise ValueError("%s is not a CML file!" % filename)
                 # BUG FIX: the original never wrote the header in the
                 # success path, so the merged file had no <?xml>/<cml>
                 # opening tags yet still ended with </cml>.  Emit the
                 # first file's header exactly once.
                 if filename == split_files[0]:
                     out.write(header)
                 molecule_found = False
                 for line in handle:
                     # Drop each input's closing tag; one shared </cml> is
                     # appended after all files are processed.
                     if line.lstrip().startswith('</cml>'):
                         continue
                     if line.lstrip().startswith('<molecule'):
                         molecule_found = True
                     if molecule_found:
                         out.write(line)
         out.write("</cml>\n")
Exemplo n.º 12
0
 def init_meta(self, dataset, copy_from=None):
     """Initialise metadata by delegating to the Text base class."""
     Text.init_meta(self, dataset, copy_from=copy_from)
Exemplo n.º 13
0
 def merge(split_files, output_file):
     """Merging multiple XML files is non-trivial and must be done in subclasses."""
     # Stitches several BLAST XML outputs into one document: the first
     # file's header is written once, every file contributes its
     # <Iteration> records, and a single shared footer is appended.
     # NOTE(review): `log` and `sleep` are expected to come from module
     # scope (a logger and time.sleep) — confirm against the file header.
     if len(split_files) == 1:
         #For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
                          % (split_files, output_file))
     out = open(output_file, "w")
     h = None
     for f in split_files:
         # A missing input may be a transient networked-filesystem error:
         # wait one second and re-check before failing.
         if not os.path.isfile(f):
             log.warning("BLAST XML file %s missing, retry in 1s..." % f)
             sleep(1)
         if not os.path.isfile(f):
             log.error("BLAST XML file %s missing" % f)
             raise ValueError("BLAST XML file %s missing" % f)
         h = open(f)
         header = h.readline()
         if not header:
             out.close()
             h.close()
             #Retry, could be transient error with networked file system...
             log.warning("BLAST XML file %s empty, retry in 1s..." % f)
             sleep(1)
             h = open(f)
             header = h.readline()
             if not header:
                 log.error("BLAST XML file %s was empty" % f)
                 raise ValueError("BLAST XML file %s was empty" % f)
         if header.strip() != '<?xml version="1.0"?>':
             out.write(header)  #for diagnosis
             out.close()
             h.close()
             raise ValueError("%s is not an XML file!" % f)
         line = h.readline()
         header += line
         # Accept either DOCTYPE variant (absolute or relative DTD path)
         # that NCBI BLAST emits.
         if line.strip() not in [
                 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">'
         ]:
             out.write(header)  #for diagnosis
             out.close()
             h.close()
             raise ValueError("%s is not a BLAST XML file!" % f)
         # Accumulate the per-file header up to (and including) the first
         # <Iteration> line.
         while True:
             line = h.readline()
             if not line:
                 out.write(header)  #for diagnosis
                 out.close()
                 h.close()
                 raise ValueError("BLAST XML file %s ended prematurely" % f)
             header += line
             if "<Iteration>" in line:
                 break
             if len(header) > 10000:
                 #Something has gone wrong, don't load too much into memory!
                 #Write what we have to the merged file for diagnostics
                 out.write(header)
                 out.close()
                 h.close()
                 raise ValueError(
                     "BLAST XML file %s has too long a header!" % f)
         if "<BlastOutput>" not in header:
             out.close()
             h.close()
             raise ValueError("%s is not a BLAST XML file:\n%s\n..." %
                              (f, header))
         if f == split_files[0]:
             # First file: keep its complete header (already ends with the
             # first <Iteration> line).
             out.write(header)
             old_header = header
         elif old_header[:300] != header[:300]:
             #Enough to check <BlastOutput_program> and <BlastOutput_version> match
             out.close()
             h.close()
             raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
                              % (split_files[0], f, old_header[:300], header[:300]))
         else:
             # Later files: drop their header and re-open the iteration tag.
             out.write("    <Iteration>\n")
         # Copy iteration records up to this file's closing tag.
         for line in h:
             if "</BlastOutput_iterations>" in line:
                 break
             #TODO - Increment <Iteration_iter-num> and if required automatic query names
             #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
             out.write(line)
         h.close()
     out.write("  </BlastOutput_iterations>\n")
     out.write("</BlastOutput>\n")
     out.close()
Exemplo n.º 14
0
 def __init__(self, **kwd):
     """Initialise the datatype; previews are capped at 10 lines."""
     Text.__init__(self, **kwd)
     # Limit the number of lines shown when previewing a dataset.
     self.max_lines = 10
Exemplo n.º 15
0
 def __init__(self, **kwd):
     """Composite datatype with a '%s.grp' group file and a '%s.ti' companion."""
     Text.__init__(self, **kwd)
     self.add_composite_file('%s.grp', description='Group File',
                             substitute_name_with_metadata='reference_name',
                             is_binary=False)
     self.add_composite_file('%s.ti', description='',
                             substitute_name_with_metadata='reference_name',
                             is_binary=False)
Exemplo n.º 16
0
 def init_meta(self, dataset, copy_from=None):
     """Delegate metadata initialisation to the Text base class."""
     Text.init_meta(self, dataset, copy_from=copy_from)
Exemplo n.º 17
0
 def __init__(self, **kwd):
     """Text-derived datatype; no extra initialisation beyond the base class."""
     Text.__init__(self, **kwd)
Exemplo n.º 18
0
 def __init__(self, **kwd):
     """Set up the datatype with a 10-line preview limit."""
     Text.__init__(self, **kwd)
     # Cap the lines rendered in dataset previews.
     self.max_lines = 10
Exemplo n.º 19
0
 def merge(split_files, output_file):
     """Merging multiple XML files is non-trivial and must be done in subclasses.

     For BLAST XML: the first file's header is kept, every file
     contributes its <Iteration> records, and one shared footer is
     appended at the end.  `log` and `sleep` come from module scope.
     """
     if len(split_files) == 1:
         #For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
                          % (split_files, output_file))
     out = open(output_file, "w")
     h = None
     for f in split_files:
         # A missing file may be a transient networked-filesystem error;
         # retry once after a second before failing.
         if not os.path.isfile(f):
             log.warning("BLAST XML file %s missing, retry in 1s..." % f)
             sleep(1)
         if not os.path.isfile(f):
             log.error("BLAST XML file %s missing" % f)
             raise ValueError("BLAST XML file %s missing" % f)
         h = open(f)
         # FIX: removed the unused local `body = False` that was assigned
         # here but never read.
         header = h.readline()
         if not header:
             out.close()
             h.close()
             #Retry, could be transient error with networked file system...
             log.warning("BLAST XML file %s empty, retry in 1s..." % f)
             sleep(1)
             h = open(f)
             header = h.readline()
             if not header:
                 log.error("BLAST XML file %s was empty" % f)
                 raise ValueError("BLAST XML file %s was empty" % f)
         if header.strip() != '<?xml version="1.0"?>':
             out.write(header) #for diagnosis
             out.close()
             h.close()
             raise ValueError("%s is not an XML file!" % f)
         line = h.readline()
         header += line
         # Accept both DOCTYPE variants that NCBI BLAST emits.
         if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                                 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
             out.write(header) #for diagnosis
             out.close()
             h.close()
             raise ValueError("%s is not a BLAST XML file!" % f)
         # Read the rest of the header up to the first <Iteration> tag.
         while True:
             line = h.readline()
             if not line:
                 out.write(header) #for diagnosis
                 out.close()
                 h.close()
                 raise ValueError("BLAST XML file %s ended prematurely" % f)
             header += line
             if "<Iteration>" in line:
                 break
             if len(header) > 10000:
                 #Something has gone wrong, don't load too much into memory!
                 #Write what we have to the merged file for diagnostics
                 out.write(header)
                 out.close()
                 h.close()
                 raise ValueError("BLAST XML file %s has too long a header!" % f)
         if "<BlastOutput>" not in header:
             out.close()
             h.close()
             raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
         if f == split_files[0]:
             # First file keeps its complete header (ends with <Iteration>).
             out.write(header)
             old_header = header
         elif old_header[:300] != header[:300]:
             #Enough to check <BlastOutput_program> and <BlastOutput_version> match
             out.close()
             h.close()
             raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
                              % (split_files[0], f, old_header[:300], header[:300]))
         else:
             # Subsequent files: skip their header, re-open the iteration.
             out.write("    <Iteration>\n")
         for line in h:
             if "</BlastOutput_iterations>" in line:
                 break
             #TODO - Increment <Iteration_iter-num> and if required automatic query names
             #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
             out.write(line)
         h.close()
     out.write("  </BlastOutput_iterations>\n")
     out.write("</BlastOutput>\n")
     out.close()
Exemplo n.º 20
0
 def __init__(self, **kwd):
     """Composite dbNSFP datatype: bgzipped data file plus its tabix index."""
     Text.__init__(self, **kwd)
     specs = (('%s.gz', 'dbNSFP bgzip'), ('%s.gz.tbi', 'Tabix Index File'))
     for name, desc in specs:
         self.add_composite_file(name, description=desc,
                                 substitute_name_with_metadata='reference_name',
                                 is_binary=True)
Exemplo n.º 21
0
 def __init__(self, **kwd):
     """Plain Text-derived datatype; delegates all setup to the base class."""
     Text.__init__(self, **kwd)