Exemplo n.º 1
0
 def _open(x, f, writer=writer, out_file=out_file):
     if out_file is not None and x == 'pass':
         handle = open(out_file, 'w')
     else:
         handle = getOutputHandle(aligner_file,
                                  out_label='db-%s' % x,
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
     return handle, writer(handle, fields=f)
Exemplo n.º 2
0
def mergeDbFiles(db_files,
                 drop=False,
                 out_file=None,
                 out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_files : list of database file names.
      drop : if True drop columns not present in all files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'merge'
    log['FILES'] = ','.join([os.path.basename(f) for f in db_files])
    log['DROP'] = drop
    printLog(log)

    # Open input
    db_handles = [open(f, 'rt') for f in db_files]
    db_iters = [TSVReader(x) for x in db_handles]
    result_count = sum([countDbFile(f) for f in db_files])

    # Define output fields
    field_list = [x.fields for x in db_iters]
    if drop:
        field_set = set.intersection(*map(set, field_list))
    else:
        field_set = set.union(*map(set, field_list))
    field_order = OrderedDict([(f, None) for f in chain(*field_list)])
    out_fields = [f for f in field_order if f in field_set]

    # Open output file
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        __, __, out_args['out_type'] = splitName(db_files[0])
        pass_handle = getOutputHandle(db_files[0],
                                      out_label='parse-merge',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for db in db_iters:
        for rec in db:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += 1

            # Write records
            pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    for x in db_handles:
        x.close()

    return pass_handle.name
Exemplo n.º 3
0
def updateDbFile(db_file,
                 field,
                 values,
                 updates,
                 out_file=None,
                 out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file : the database file name.
      field : the field to update.
      values : a list of values to specifying which rows to update.
      updates : a list of values to update each value with.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-update',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count = 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Updated values if found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 4
0
def sortDbFile(db_file,
               field,
               numeric=False,
               descend=False,
               out_file=None,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
      db_file : the database filename
      field : the field name to sort by
      numeric : if True sort field numerically;
                if False sort field alphabetically
      descend : if True sort in descending order;
                if False sort in ascending order
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-sort',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Store all records in a dictionary
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    db_dict = {i: r for i, r in enumerate(db_iter)}
    result_count = len(db_dict)

    # Sort db_dict by field values
    tag_dict = {k: v[field] for k, v in db_dict.items()}
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Write records
        pass_writer.writeDict(db_dict[key])

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 5
0
def selectDbFile(db_file,
                 fields,
                 values,
                 logic='any',
                 regex=False,
                 out_file=None,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file : the database file name
      fields : a list of fields to check for selection criteria
      values : a list of values defining selection targets
      logic : one of 'any' or 'all' defining whether one or all fields must have a match.
      regex : if False do exact full string matches; if True allow partial regex matches.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str : output file name.
    """
    # Define string match function
    if regex:

        def _match_func(x, patterns):
            return any([re.search(p, x) for p in patterns])
    else:

        def _match_func(x, patterns):
            return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-select',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Check for selection values in all fields
        select = _logic_func(
            [_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writeDict(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 6
0
def renameDbFile(db_file,
                 fields,
                 names,
                 out_file=None,
                 out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to rename.
      values : a list of new names for fields.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Get header and rename fields
    out_fields = list(db_iter.fields)
    for f, n in zip(fields, names):
        i = out_fields.index(f)
        out_fields[i] = n

    # Open writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-rename',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # TODO:  repeating renaming is unnecessary.
        # Rename fields
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 7
0
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file : filename of the tab-delimited database file to split
      field : the field name by which to split db_file
      num_split : the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      list : a list of output file names.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        with open(db_file, 'rt') as tmp_handle:
            tmp_iter = TSVReader(tmp_handle)
            tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filename and replacements
        no_good = {
            '\/': 'f',
            '\\': 'b',
            '?': 'q',
            '\%': 'p',
            '*': 's',
            ':': 'c',
            '\|': 'pi',
            '\"': 'dq',
            '\'': 'sq',
            '<': 'gt',
            '>': 'lt',
            ' ': '_'
        }
        # Replace forbidden characters in tag_list
        tag_dict = {}
        for tag in tag_list:
            for c, r in no_good.items():
                tag_dict[tag] = (tag_dict.get(tag, tag).replace(c,r) \
                                 if c in tag else tag_dict.get(tag, tag))

        # Create output handles
        handles_dict = {
            tag: getOutputHandle(db_file,
                                 out_label='%s-%s' % (field, label),
                                 out_name=out_args['out_name'],
                                 out_dir=out_args['out_dir'],
                                 out_type=out_args['out_type'])
            for tag, label in tag_dict.items()
        }

        # Create Db writer instances
        writers_dict = {
            tag: TSVWriter(handles_dict[tag], fields=out_fields)
            for tag in tag_dict
        }

        # Iterate over records
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writeDict(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {
            'under':
            getOutputHandle(db_file,
                            out_label='under-%.1f' % num_split,
                            out_name=out_args['out_name'],
                            out_dir=out_args['out_dir'],
                            out_type=out_args['out_type']),
            'atleast':
            getOutputHandle(db_file,
                            out_label='atleast-%.1f' % num_split,
                            out_name=out_args['out_name'],
                            out_dir=out_args['out_dir'],
                            out_type=out_args['out_type'])
        }

        # Create Db writer instances
        writers_dict = {
            'under': TSVWriter(handles_dict['under'], fields=out_fields),
            'atleast': TSVWriter(handles_dict['atleast'], fields=out_fields)
        }

        # Iterate over records
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writeDict(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    db_handle.close()
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
Exemplo n.º 8
0
def dropDbFile(db_file, fields, out_file=None, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to drop.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
     str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Exclude dropped field from output
    out_fields = [f for f in db_iter.fields if f not in fields]

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-drop',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write row
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
Exemplo n.º 9
0
def indexDbFile(db_file,
                field=default_index_field,
                out_file=None,
                out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file : the database file name.
      field : the name of the index field to add.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Append index field
    out_fields = list(db_iter.fields)
    out_fields.append(field)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-index',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Add count and write updated row
        rec.update({field: rec_count})
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 10
0
def addDbFile(db_file,
              fields,
              values,
              out_file=None,
              out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to add.
      values : a list of values to assign to all rows of each field.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open inut
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Add fields
    out_fields = list(db_iter.fields)
    out_fields.extend(fields)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-add',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append
    add_dict = {
        k: v
        for k, v in zip(fields, values) if k not in db_iter.fields
    }

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 11
0
def convertToGenbank(db_file, inference=None, db_xref=None, molecule=default_molecule,
                     product=default_product, features=None, c_field=None, label=None,
                     count_field=None, index_field=None, allow_stop=False,
                     asis_id=False, asis_calls=False, allele_delim=default_allele_delim,
                     build_asn=False, asn_template=None, tbl2asn_exec=default_tbl2asn_exec,
                     format=default_format, out_file=None,
                     out_args=default_out_args):
    """
    Builds GenBank submission fasta and table files

    Arguments:
      db_file : the database file name.
      inference : reference alignment tool.
      db_xref : reference database link.
      molecule : source molecule (eg, "mRNA", "genomic DNA")
      product : Product (protein) name.
      features : dictionary of sample features (BioSample attributes) to add to the description of each record.
      c_field : column containing the C region gene call.
      label : a string to use as a label for the ID. if None do not add a field label.
      count_field : field name to populate the AIRR_READ_COUNT note.
      index_field : field name to populate the AIRR_CELL_INDEX note.
      allow_stop : if True retain records with junctions having stop codons.
      asis_id : if True use the original sequence ID for the output IDs.
      asis_calls : if True do not parse gene calls for IMGT nomenclature.
      allele_delim : delimiter separating the gene name from the allele number when asis_calls=True.
      build_asn : if True run tbl2asn on the generated .tbl and .fsa files.
      asn_template : template file (.sbt) to pass to tbl2asn.
      tbl2asn_exec : name of or path to the tbl2asn executable.
      format : input and output format.
      out_file : output file name without extension. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      tuple : the output (feature table, fasta) file names.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'genbank'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    try:
        reader, __, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['sequence_input',
                    'v_call', 'd_call', 'j_call',
                    'v_seq_start', 'd_seq_start', 'j_seq_start']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Open output
    if out_file is not None:
        out_name, __ = os.path.splitext(out_file)
        fsa_handle = open('%s.fsa' % out_name, 'w')
        tbl_handle = open('%s.tbl' % out_name, 'w')
    else:
        fsa_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='fsa')
        tbl_handle = getOutputHandle(db_file, out_label='genbank', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='tbl')

    # Count records
    result_count = countDbFile(db_file)

    # Define writer
    writer = csv.writer(tbl_handle, delimiter='\t', quoting=csv.QUOTE_NONE)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Extract table dictionary
        name = None if asis_id else rec_count
        seq = makeGenbankSequence(rec, name=name, label=label, count_field=count_field, index_field=index_field,
                                  molecule=molecule, features=features)
        tbl = makeGenbankFeatures(rec, start=seq['start'], end=seq['end'], product=product,
                                  db_xref=db_xref, inference=inference, c_field=c_field,
                                  allow_stop=allow_stop, asis_calls=asis_calls, allele_delim=allele_delim)

        if tbl is not None:
            pass_count +=1
            # Write table
            writer.writerow(['>Features', seq['record'].id])
            for feature, qualifiers in tbl.items():
                writer.writerow(feature)
                if qualifiers:
                    for x in qualifiers:
                        writer.writerow(list(chain(['', '', ''], x)))

            # Write sequence
            SeqIO.write(seq['record'], fsa_handle, 'fasta')
        else:
            fail_count += 1

    # Final progress bar
    printProgress(rec_count, result_count, 0.05, start_time=start_time)

    # Run tbl2asn
    if build_asn:
        start_time = time()
        printMessage('Running tbl2asn', start_time=start_time, width=25)
        result = runASN(fsa_handle.name, template=asn_template, exec=tbl2asn_exec)
        printMessage('Done', start_time=start_time, end=True, width=25)

    # Print ending console log
    log = OrderedDict()
    log['OUTPUT_TBL'] = os.path.basename(tbl_handle.name)
    log['OUTPUT_FSA'] = os.path.basename(fsa_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    tbl_handle.close()
    fsa_handle.close()
    db_handle.close()

    return (tbl_handle.name, fsa_handle.name)
Exemplo n.º 12
0
def convertToFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments: 
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sequences.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    if meta_fields is not None:  log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)
    
    # Open input
    out_type = 'fasta'
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=out_type)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Get SeqRecord
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, out_type)
        else:
            fail_count += 1
        
    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 13
0
def convertToBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments: 
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sample sequences.
      germ_field : the field containing germline sequences.
      cluster_field : the field containing clonal groupings;
                    if None write the germline for each record.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
     str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:  log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)
    
    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type='clip')
    # Iterate over records
    start_time = time()
    rec_count, germ_count, pass_count, fail_count = 0, 0, 0, 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        
        # Update cluster ID
        cluster = rec.get(cluster_field, None)
        
        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = buildSeqRecord(rec, id_field, germ_field, meta_fields)
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = buildSeqRecord(rec, cluster_field, germ_field)
            germ.id = '>' + germ.id            
        else:
            germ = None

        # Get read SeqRecord
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)
        
        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')
        
        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1
        
        # Set last cluster ID
        cluster_last = cluster
        
    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 14
0
def convertToChangeo(db_file, out_file=None, out_args=default_out_args):
    """
    Converts an AIRR formatted file into an Change-O formatted file

    Arguments:
      db_file: the database file name.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'changeo'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = AIRRReader(db_handle)

    # Set output fields replacing length with end fields
    in_fields = [AIRRSchema.toReceptor(f) for f in db_iter.fields]
    out_fields = []
    for f in in_fields:
        out_fields.append(f)
        if f in ReceptorData.end_fields and ReceptorData.end_fields[f][0] in in_fields:
            out_fields.append(ReceptorData.end_fields[f][1])
    out_fields = list(OrderedDict.fromkeys(out_fields))
    out_fields = [ChangeoSchema.fromReceptor(f) for f in out_fields]

    # Open output writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='changeo', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=ChangeoSchema.out_type)
    pass_writer = ChangeoWriter(pass_handle, fields=out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write records
        pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 15
0
def insertGaps(db_file, references=None, format=default_format,
               out_file=None, out_args=default_out_args):
    """
    Inserts IMGT numbering into V fields

    Arguments:
      db_file : the database file name.
      references : folder with germline repertoire files. If None, do not updated alignment columns wtih IMGT gaps.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
     str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'imgt'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['sequence_imgt', 'v_germ_start_imgt']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Load references
    reference_dict = readGermlines(references)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Open output writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='gap', out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'], out_type=schema.out_type)
    pass_writer = writer(pass_handle, fields=db_iter.fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Update IMGT fields
        imgt_dict = correctIMGTFields(rec, reference_dict)
        # Write records
        if imgt_dict is not None:
            pass_count += 1
            rec.setDict(imgt_dict, parse=False)
            pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = rec_count - pass_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
Exemplo n.º 16
0
def createGermlines(db_file, references, seq_field=default_seq_field, v_field=default_v_field,
                    d_field=default_d_field, j_field=default_j_field,
                    cloned=False, clone_field=default_clone_field, germ_types=default_germ_types,
                    format=default_format, out_file=None, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
      db_file : input tab-delimited database file.
      references : folders and/or files containing germline repertoire data in FASTA format.
      seq_field : field in which to look for sequence.
      v_field : field in which to look for V call.
      d_field : field in which to look for D call.
      j_field : field in which to look for J call.
      cloned : if True build germlines by clone, otherwise build individual germlines.
      clone_field : field containing clone identifiers; ignored if cloned=False.
      germ_types : list of germline sequence types to be output from the set of 'full', 'dmask', 'vonly', 'regions'
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : arguments for output preferences.

    Returns:
      dict: names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = ','.join(germ_types)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['D_FIELD'] = d_field
    log['J_FIELD'] = j_field
    log['CLONED'] = cloned
    if cloned:  log['CLONE_FIELD'] = clone_field
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s' % format)
    out_args['out_type'] = schema.out_type

    # TODO: this won't work for AIRR necessarily
    # Define output germline fields
    germline_fields = OrderedDict()
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:  germline_fields['full'] = 'germline_' + seq_type
    if 'dmask' in germ_types:  germline_fields['dmask'] = 'germline_' + seq_type + '_d_mask'
    if 'vonly' in germ_types:  germline_fields['vonly'] = 'germline_' + seq_type + '_v_region'
    if 'regions' in germ_types:  germline_fields['regions'] = 'germline_regions'
    if cloned:
        germline_fields['v'] = 'germline_v_call'
        germline_fields['d'] = 'germline_d_call'
        germline_fields['j'] = 'germline_j_call'
    out_fields = getDbFields(db_file,
                             add=[schema.fromReceptor(f) for f in germline_fields.values()],
                             reader=reader)

    # Get repertoire and open Db reader
    reference_dict = readGermlines(references)
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['v_germ_start_imgt', 'd_germ_start', 'j_germ_start',
                    'np1_length', 'np2_length']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Count input
    total_count = countDbFile(db_file)

    # Check for existence of fields
    for f in [v_field, d_field, j_field, seq_field]:
        if f not in db_iter.fields:
            printError('%s field does not exist in input database file.' % f)

    # Translate to Receptor attribute names
    v_field = schema.toReceptor(v_field)
    d_field = schema.toReceptor(d_field)
    j_field = schema.toReceptor(j_field)
    seq_field = schema.toReceptor(seq_field)
    clone_field = schema.toReceptor(clone_field)

    # Define Receptor iterator
    if cloned:
        start_time = time()
        printMessage('Sorting by clone', start_time=start_time, width=20)
        sorted_records = sorted(db_iter, key=lambda x: x.getField(clone_field))
        printMessage('Done', start_time=start_time, end=True, width=20)
        receptor_iter = groupby(sorted_records, lambda x: x.getField(clone_field))
    else:
        receptor_iter = ((x.sequence_id, [x]) for x in db_iter)

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Initialize handles, writers and counters
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    rec_count, pass_count, fail_count = 0, 0, 0
    start_time = time()

    # Iterate over rows
    for key, records in receptor_iter:
        # Print progress
        printProgress(rec_count, total_count, 0.05, start_time=start_time)

        # Define iteration variables
        records = list(records)
        rec_log = OrderedDict([('ID', key)])
        rec_count += len(records)

        # Build germline for records
        if len(records) == 1:
            germ_log, germlines, genes = buildGermline(records[0], reference_dict, seq_field=seq_field, v_field=v_field,
                                                       d_field=d_field, j_field=j_field)
        else:
            germ_log, germlines, genes = buildClonalGermline(records, reference_dict, seq_field=seq_field, v_field=v_field,
                                                             d_field=d_field, j_field=j_field)
        rec_log.update(germ_log)

        # Write row to pass or fail file
        if germlines is not None:
            pass_count += len(records)

            # Add germlines to Receptor record
            annotations = {}
            if 'full' in germ_types:  annotations[germline_fields['full']] = germlines['full']
            if 'dmask' in germ_types:  annotations[germline_fields['dmask']] = germlines['dmask']
            if 'vonly' in germ_types:  annotations[germline_fields['vonly']] = germlines['vonly']
            if 'regions' in germ_types:  annotations[germline_fields['regions']] = germlines['regions']
            if cloned:
                annotations[germline_fields['v']] = genes['v']
                annotations[germline_fields['d']] = genes['d']
                annotations[germline_fields['j']] = genes['j']

            # Write records
            try:
                for r in records:
                    r.setDict(annotations)
                    pass_writer.writeReceptor(r)
            except AttributeError:
                # Create output file handle and writer
                if out_file is not None:
                    pass_handle = open(out_file, 'w')
                else:
                    pass_handle = getOutputHandle(db_file,
                                                  out_label='germ-pass',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                pass_writer = writer(pass_handle, fields=out_fields)
                for r in records:
                    r.setDict(annotations)
                    pass_writer.writeReceptor(r)
        else:
            fail_count += len(records)
            if out_args['failed']:
                try:
                    fail_writer.writeReceptor(records)
                except AttributeError:
                    fail_handle = getOutputHandle(db_file,
                                                  out_label='germ-fail',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                    fail_writer = writer(fail_handle, fields=out_fields)
                    fail_writer.writeReceptor(records)

        # Write log
        printLog(rec_log, handle=log_handle)

    # Print log
    printProgress(rec_count, total_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    db_handle.close()
    output = {'pass': None, 'fail': None}
    if pass_handle is not None:
        output['pass'] = pass_handle.name
        pass_handle.close()
    if fail_handle is not None:
        output['fail'] = fail_handle.name
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()

    return output