Example #1

# Imports assumed for this excerpt, based on the airr package layout.
from io import StringIO

import pandas as pd

from airr.io import RearrangementReader
from airr.schema import RearrangementSchema

def load_rearrangement(filename, validate=False, debug=False):
    """
    Load the contents of an AIRR rearrangements file into a data frame

    Arguments:
      filename (str): input file path.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      pandas.DataFrame: Rearrangement records as rows of a data frame.
    """
    # TODO: test pandas.read_csv with the converters argument as an alternative
    schema = RearrangementSchema

    df = pd.read_csv(filename,
                     sep='\t',
                     header=0,
                     index_col=None,
                     dtype=schema.pandas_types(),
                     true_values=schema.true_values,
                     false_values=schema.false_values)
    # Round-trip the frame through an in-memory buffer so RearrangementReader
    # can be used without modifying it:
    buffer = StringIO()  # create an empty buffer
    df.to_csv(buffer, sep='\t', index=False)  # fill buffer
    buffer.seek(0)  # set to the start of the stream

    reader = RearrangementReader(buffer, validate=validate, debug=debug)

    df = pd.DataFrame(list(reader))
    return df
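
A minimal usage sketch for load_rearrangement; the file name is hypothetical, and sequence_id and v_call are required AIRR Rearrangement fields:

df = load_rearrangement('rearrangements.tsv', validate=True)
print(df.shape)                              # (number of records, number of fields)
print(df[['sequence_id', 'v_call']].head())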
Example #2

import sys
from collections import OrderedDict
from itertools import chain

from airr.io import RearrangementReader, RearrangementWriter

def merge_rearrangement(out_filename, in_filenames, drop=False, debug=False):
    """
    Merge one or more AIRR rearrangements files

    Arguments:
      out_filename (str): output file path.
      in_filenames (list): list of input files to merge.
      drop (bool): drop flag. If True then drop fields that do not exist in all input
                   files, otherwise combine fields from all input files.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      bool: True if files were successfully merged, otherwise False.
    """
    try:
        # gather fields from input files
        readers = (RearrangementReader(open(f, 'r'), debug=False)
                   for f in in_filenames)
        field_list = [x.fields for x in readers]
        if drop:
            field_set = set.intersection(*map(set, field_list))
        else:
            field_set = set.union(*map(set, field_list))
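        # Preserve the order in which fields were first seen across all inputs,
        # then keep only those fields selected by the intersection/union above.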
        field_order = OrderedDict([(f, None) for f in chain(*field_list)])
        out_fields = [f for f in field_order if f in field_set]

        # write input files to output file sequentially
        readers = (RearrangementReader(open(f, 'r'), debug=debug)
                   for f in in_filenames)
        with open(out_filename, 'w+') as handle:
            writer = RearrangementWriter(handle,
                                         fields=out_fields,
                                         debug=debug)
            for reader in readers:
                for r in reader:
                    writer.write(r)
                reader.close()
    except Exception as e:
        sys.stderr.write(
            'Error occurred while merging AIRR rearrangement files: %s\n' % e)
        return False

    return True
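
A usage sketch with hypothetical file names; drop=True keeps only the fields common to all inputs, while the default (drop=False) keeps the union of fields:

ok = merge_rearrangement('merged.tsv', ['sample1.tsv', 'sample2.tsv'], drop=True)
print('Merged successfully' if ok else 'Merge failed')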
Example #3

from airr.io import RearrangementReader

def read_rearrangement(filename, validate=False, debug=False):
    """
    Open an iterator to read an AIRR rearrangements file

    Arguments:
      filename (str): path to the input file.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      airr.io.RearrangementReader: iterable reader class.
    """

    return RearrangementReader(open(filename, 'r'), validate=validate, debug=debug)
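
A usage sketch; each record yielded by the reader is a dictionary keyed by the AIRR field names found in the file (the file name here is hypothetical, and the junction_aa column is assumed present):

reader = read_rearrangement('rearrangements.tsv', validate=False)
for record in reader:
    print(record['sequence_id'], record['junction_aa'])
reader.close()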
Example #4

import sys

from airr.io import RearrangementReader
from airr.schema import ValidationError

def validate_rearrangement(filename, debug=False):
    """
    Validate an AIRR rearrangements file

    Arguments:
      filename (str): path of the file to validate.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      bool: True if the file passed validation, otherwise False.
    """
    valid = True
    if debug:
        sys.stderr.write('Validating: %s\n' % filename)

    # Open reader
    handle = open(filename, 'r')
    reader = RearrangementReader(handle, validate=True)

    # Validate header
    try:
        iter(reader)
    except ValidationError as e:
        valid = False
        if debug:
            sys.stderr.write('%s has validation error: %s\n' % (filename, e))

    # Validate each row
    i = 0
    while True:
        try:
            i = i + 1
            next(reader)
        except StopIteration:
            break
        except ValidationError as e:
            valid = False
            if debug:
                sys.stderr.write('%s at record %i has validation error: %s\n' %
                                 (filename, i, e))

    # Close
    handle.close()

    return valid
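
A usage sketch; with debug=True each validation failure is reported on standard error (the file name is hypothetical):

if validate_rearrangement('rearrangements.tsv', debug=True):
    print('File passed validation')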
Example #5

from airr.io import RearrangementReader, RearrangementWriter

def derive_rearrangement(out_filename, in_filename, fields=None, debug=False):
    """
    Create an empty AIRR rearrangements file with fields derived from an existing file

    Arguments:
      out_filename (str): output file path.
      in_filename (str): existing file to derive fields from.
      fields (list): additional non-required fields to add to the output.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      airr.io.RearrangementWriter: open writer class.
    """
    reader = RearrangementReader(open(in_filename, 'r'))
    in_fields = list(reader.fields)
    if fields is not None:
        in_fields.extend([f for f in fields if f not in in_fields])

    return RearrangementWriter(open(out_filename, 'w+'), fields=in_fields, debug=debug)
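
A usage sketch that derives a writer from an existing file, adds a hypothetical extra field, and copies the input records across:

writer = derive_rearrangement('out.tsv', 'in.tsv', fields=['my_custom_field'])
for record in read_rearrangement('in.tsv'):
    record['my_custom_field'] = 'x'  # hypothetical annotation value
    writer.write(record)
writer.close()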
Example #6

import pandas as pd

from airr.io import RearrangementReader

def load_rearrangement(filename, validate=False, debug=False):
    """
    Load the contents of an AIRR rearrangements file into a data frame

    Arguments:
      filename (str): input file path.
      validate (bool): whether to validate data as it is read, raising a ValidationError
                       exception in the event of an error.
      debug (bool): debug flag. If True print debugging information to standard error.

    Returns:
      pandas.DataFrame: Rearrangement records as rows of a data frame.
    """
    # TODO: test pandas.read_csv with the converters argument as an alternative
    # schema = RearrangementSchema
    # df = pd.read_csv(handle, sep='\t', header=0, index_col=None,
    #                  dtype=schema.numpy_types(), true_values=schema.true_values,
    #                  false_values=schema.false_values)
    # return df
    with open(filename, 'r') as handle:
        reader = RearrangementReader(handle, validate=validate, debug=debug)
        df = pd.DataFrame(list(reader))
    return df
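
Example #7

# Imports assumed for this excerpt; Rearrangement, Parser, and the enclosing
# class's helper methods (getAIRRMap, getRepositoryTag, etc.) come from the
# surrounding code base and are not shown here.
import json
import os
import time

import pandas as pd

from airr.io import RearrangementReader
from airr.schema import ValidationError
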
    def processAIRRTSVFile(self, file_handle, path):

        # Start a timer for performance reasons.
        t_start_full = time.perf_counter()

        # Get the AIRR Map object for this class (for convenience).
        airr_map = self.getAIRRMap()

        # Set the tag for the repository that we are using.
        repository_tag = self.getRepositoryTag()
        # Set the tag for the iReceptor identifier.
        ireceptor_tag = self.getiReceptorTag()

        # Get the fields to use for finding repertoire IDs, either using those IDs
        # directly or by looking for a repertoire ID based on a rearrangement file
        # name.
        repertoire_link_field = self.getRepertoireLinkIDField()
        rearrangement_link_field = self.getRearrangementLinkIDField()
        rearrangement_file_field = self.getRearrangementFileField()

        # Set the tag for the file mapping that we are using. This is essentially the
        # lookup into the columns of the AIRR Mapping that we are using. For the IgBLAST
        # parser it is normally the "igblast" column (which is essentially the same as
        # AIRR TSV), but it can be overridden by the user.
        filemap_tag = self.getFileMapping()

        # Set the size of each chunk of data that is inserted.
        chunk_size = self.getRepositoryChunkSize()

        # Validate the AIRR TSV file header. We do not validate the entire
        # file because that is too expensive an operation.
        # Validate the header by trying to read the first record; if that throws
        # an error then we have a problem.
        airr_reader = RearrangementReader(file_handle,
                                          validate=True,
                                          debug=True)
        airr_valid = True
        try:
            airr_iterator = iter(airr_reader)
            first_record = next(airr_iterator)
        except ValidationError as e:
            airr_valid = False
            print("ERROR: File %s is not a valid AIRR TSV file, %s" %
                  (path, e))
            return False
        if airr_valid:
            print("Info: File %s has a valid AIRR TSV header" % (path))

        # Get root filename from the path
        filename = os.path.basename(path)

        # Get the single, unique repertoire link id for the filename we are loading. If
        # we can't find one, this is an error and we return failure.
        repertoire_link_id = self.getRepertoireInfo(filename)
        if repertoire_link_id is None:
            print("ERROR: Could not link file %s to a valid repertoire" %
                  (filename))
            return False

        # Extract the fields that are of interest for this file: essentially all
        # non-null fields in the file. This is a boolean array that is True wherever
        # there is a non-null field in the column of interest.
        map_column = airr_map.getRearrangementMapColumn(filemap_tag)
        fields_of_interest = map_column.notnull()

        # We select the rows in the mapping that contain fields of interest from the
        # file. At this point, file_fields contains N columns that contain our mappings
        # for the specific formats (e.g. iReceptor, AIRR, VQuest). The rows are limited
        # to only data that is relevant to the file format column of interest.
        file_fields = airr_map.getRearrangementRows(fields_of_interest)

        # We need to build the set of fields that the repository can store. We don't
        # want to extract fields that the repository doesn't want.
        igblastColumns = []
        columnMapping = {}
        if self.verbose():
            print("Info: Dumping expected %s (%s) to repository mapping" %
                  (self.getAnnotationTool(), filemap_tag))
        for index, row in file_fields.iterrows():
            if self.verbose():
                print("Info:    %s -> %s" %
                      (str(row[filemap_tag]), str(row[repository_tag])))
            # If the repository column has a value for the field in the file, track the
            # field from both the file and repository side.
            if not pd.isnull(row[repository_tag]):
                igblastColumns.append(row[filemap_tag])
                columnMapping[row[filemap_tag]] = row[repository_tag]
            else:
                print("Info:     Repository does not support " +
                      str(row[filemap_tag]) +
                      ", not inserting into repository")

        # Get the field names from the file from the airr_reader object.
        # Determine the mapping from the file input to the repository.
        finalMapping = {}
        for airr_field in airr_reader.fields:
            if airr_field in columnMapping:
                if self.verbose():
                    print("Info: Mapping %s field in file: %s -> %s" %
                          (self.getAnnotationTool(), airr_field,
                           columnMapping[airr_field]))
                finalMapping[airr_field] = columnMapping[airr_field]
            else:
                if self.verbose():
                    print("Info: No mapping for input " +
                          self.getAnnotationTool() + " field " + airr_field +
                          ", adding to repository without mapping.")

        # Determine if we are missing any repository columns from the input data.
        for igblast_column, mongo_column in columnMapping.items():
            if igblast_column not in airr_reader.fields:
                if self.verbose():
                    print("Info: Missing data in input " +
                          self.getAnnotationTool() + " file for " +
                          igblast_column)

        # Create a reader for the data frame with step size "chunk_size"
        if self.verbose():
            print("Info: Processing raw data frame...")
        airr_df_reader = pd.read_csv(path, sep='\t', chunksize=chunk_size)

        # Iterate over the file with data frames of size "chunk_size"
        total_records = 0
        for airr_df in airr_df_reader:
            # Remap the column names. We need to remap because the columns may be in a
            # different order in the file than in the column mapping.
            airr_df.rename(finalMapping, axis='columns', inplace=True)

            # Build the substring array that allows indexing for fast searching of
            # junction AA substrings.
            junction_aa = airr_map.getMapping("junction_aa", ireceptor_tag,
                                              repository_tag)
            ir_substring = airr_map.getMapping("ir_substring", ireceptor_tag,
                                               repository_tag)
            ir_junc_aa_len = airr_map.getMapping("ir_junction_aa_length",
                                                 ireceptor_tag, repository_tag)
            if junction_aa in airr_df:
                if self.verbose():
                    print(
                        "Info: Retrieving junction AA and building substrings",
                        flush=True)
                airr_df[ir_substring] = airr_df[junction_aa].apply(
                    Rearrangement.get_substring)

                # The AIRR TSV format doesn't have an AA length field, but we want
                # it in the repository.
                if ir_junc_aa_len not in airr_df:
                    if self.verbose():
                        print("Info: Computing junction amino acids length...",
                              flush=True)
                    airr_df[ir_junc_aa_len] = airr_df[junction_aa].apply(
                        Parser.len_null_to_null)

            # We need to look up the "known parameter" from an iReceptor perspective
            # (the field name in the iReceptor column mapping) and map that to the
            # correct field name for the repository we are writing to.
            v_call = airr_map.getMapping("v_call", ireceptor_tag,
                                         repository_tag)
            d_call = airr_map.getMapping("d_call", ireceptor_tag,
                                         repository_tag)
            j_call = airr_map.getMapping("j_call", ireceptor_tag,
                                         repository_tag)
            ir_vgene_gene = airr_map.getMapping("ir_vgene_gene", ireceptor_tag,
                                                repository_tag)
            ir_dgene_gene = airr_map.getMapping("ir_dgene_gene", ireceptor_tag,
                                                repository_tag)
            ir_jgene_gene = airr_map.getMapping("ir_jgene_gene", ireceptor_tag,
                                                repository_tag)
            ir_vgene_family = airr_map.getMapping("ir_vgene_family",
                                                  ireceptor_tag,
                                                  repository_tag)
            ir_dgene_family = airr_map.getMapping("ir_dgene_family",
                                                  ireceptor_tag,
                                                  repository_tag)
            ir_jgene_family = airr_map.getMapping("ir_jgene_family",
                                                  ireceptor_tag,
                                                  repository_tag)

            # Build the v_call field, as an array if there is more than one gene
            # assignment made by the annotator.
            self.processGene(airr_df, v_call, v_call, ir_vgene_gene,
                             ir_vgene_family)
            self.processGene(airr_df, j_call, j_call, ir_jgene_gene,
                             ir_jgene_family)
            self.processGene(airr_df, d_call, d_call, ir_dgene_gene,
                             ir_dgene_family)
            # If we don't already have a locus (that is, the data file didn't provide
            # one) then calculate the locus based on the v_call array.
            locus = airr_map.getMapping("locus", ireceptor_tag, repository_tag)
            if locus not in airr_df:
                airr_df[locus] = airr_df[v_call].apply(Rearrangement.getLocus)

            # Keep track of the repertoire id so we can link each rearrangement to
            # a repertoire.
            rep_rearrangement_link_field = airr_map.getMapping(
                rearrangement_link_field, ireceptor_tag, repository_tag)
            airr_df[rep_rearrangement_link_field] = repertoire_link_id

            # Set the relevant IDs for the record being inserted. If it fails, don't
            # load any data.
            if not self.checkIDFields(airr_df, repertoire_link_id):
                return False

            # Create the created and updated timestamps for this block of records. Note
            # that this means that each block of inserts will have the same date.
            now_str = Rearrangement.getDateTimeNowUTC()
            ir_created_at = airr_map.getMapping("ir_created_at", ireceptor_tag,
                                                repository_tag)
            ir_updated_at = airr_map.getMapping("ir_updated_at", ireceptor_tag,
                                                repository_tag)
            airr_df[ir_created_at] = now_str
            airr_df[ir_updated_at] = now_str

            # Transform the data frame so that it meets the repository type requirements
            if not self.mapToRepositoryType(airr_df):
                print("ERROR: Unable to map data to the repository")
                return False

            # Insert the chunk of records into Mongo.
            num_records = len(airr_df)
            print("Info: Inserting",
                  num_records,
                  "records into Mongo...",
                  flush=True)
            t_start = time.perf_counter()
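            # Transposing and serializing to JSON yields a dict keyed by row
            # index; its values() are one dict per rearrangement record.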
            records = json.loads(airr_df.T.to_json()).values()
            self.repositoryInsertRearrangements(records)
            t_end = time.perf_counter()
            print("Info: Inserted records, time =", (t_end - t_start),
                  "seconds",
                  flush=True)

            # Keep track of the total number of records processed.
            total_records = total_records + num_records
            print("Info: Total records so far =", total_records, flush=True)

        # Get the number of annotations for this repertoire (as defined by the
        # repertoire link id).
        if self.verbose():
            print("Info: Getting the number of annotations for repertoire %s" %
                  (str(repertoire_link_id)))
        annotation_count = self.repositoryCountRearrangements(
            repertoire_link_id)
        if annotation_count == -1:
            print("ERROR: invalid annotation count (%d), write failed." %
                  (annotation_count))
            return False

        if self.verbose():
            print("Info: Annotation count = %d" % (annotation_count),
                  flush=True)

        # Set the cached sequence count field for the repertoire.
        self.repositoryUpdateCount(repertoire_link_id, annotation_count)

        # Report what we added and the total annotation count for this repertoire.
        t_end_full = time.perf_counter()
        print(
            "Info: Inserted %d records, annotation count = %d, %f s, %f insertions/s"
            % (total_records, annotation_count, t_end_full - t_start_full,
               total_records / (t_end_full - t_start_full)),
            flush=True)

        return True