Exemplo n.º 1
0
def parse_annotations_to_dict(annotations_fileh):
    """Build a term-to-genes mapping from a CSV annotations file.

    Each row is read for two columns: "gene_id", holding the gene/gene
    product ID, and "term", holding the name or ID of the term
    annotating that gene/product. Any other columns in the row are not
    consulted.

    Returns a `convstructs.TwoWaySetDict` instance whose keys are the
    annotation terms and whose values are `set`s of the genes annotated
    by each term.

    :Parameters:
    - `annotations_fileh`: a CSV file with a header line as the first
      line

    """
    term_to_genes = convstructs.TwoWaySetDict()
    for row in convutils.make_csv_reader(annotations_fileh):
        gene_id = row["gene_id"]
        term_name = row["term"]
        if term_name not in term_to_genes:
            # First time this term is seen: start a new gene set.
            term_to_genes[term_name] = set([gene_id])
        else:
            term_to_genes.add_item(term_name, gene_id)

    return term_to_genes
Exemplo n.º 2
0
def parse_interactions_file_to_graph(interactions_fileh):
    """Build an interaction graph from a CSV interactions file.

    Each row must provide the columns "interactor1" and "interactor2",
    naming the two ends of an interaction. When a row also carries a
    "weight" column, its value is converted to `float` and used as the
    edge weight (the "confidence" in the interaction); otherwise the
    edge gets a weight of 1. Any other columns are not consulted.

    Returns a `structures.EdgeSwapGraph` with genes/gene products as
    nodes and interactions as weighted edges.

    :Parameters:
    - `interactions_fileh`: a CSV file with a header line as the first
      line

    """
    graph = structures.EdgeSwapGraph()
    for row in convutils.make_csv_reader(interactions_fileh):
        first_node = row["interactor1"]
        second_node = row["interactor2"]
        # Rows without a "weight" column fall back to a unit weight.
        edge_weight = float(row["weight"]) if "weight" in row else 1
        graph.add_edge(first_node, second_node, weight=edge_weight)

    return graph
Exemplo n.º 3
0
def parse_selected_links_file(selected_links_fileh):
    """Parse a CSV pairs file to an iterator of links.

    The file should have no header and only two columns, where the
    annotation in the first column needs to be tested if it is "linked
    to" the annotation in the second column.

    NOTE: This is a generator; it will yield links until the file is
    completely consumed. Each link is yielded as a 2-`tuple`.

    Raises an `AssertionError` naming the offending (1-based) line if a
    line does not contain exactly two entries.

    :Parameters:
    - `selected_links_fileh`: a CSV file of two columns and no headers
      with annotations in the columns

    """
    csv_reader = convutils.make_csv_reader(selected_links_fileh, False)
    # Count lines from 1 so the number in the message matches the file.
    for line_number, link in enumerate(csv_reader, 1):
        # The message is formatted from an already-computed line number;
        # writing `"..." % i + 1` would bind as `("..." % i) + 1` and
        # raise a TypeError instead of reporting the bad line.
        assert len(link) == 2, (
            "Line %d has fewer or greater than two annotation entries."
            % line_number
        )
        yield tuple(link)
Exemplo n.º 4
0
def parse_expression_file(expression_fileh):
    """Read gene expression values from a CSV file.

    Returns a dictionary with gene (product) identifiers as keys and
    expression values (as `float`s) as values.

    If the same identifier appears on more than one row, a single
    warning is logged for the whole file and the largest of the values
    seen for that identifier is kept.

    :Parameters:
    - `expression_fileh`: a CSV file of gene (or gene product)
      expression values. The file should have a column titled "id" which
      has the gene (or gene product) ID, and a column titled
      "expression" which gives a value for the expression level, or
      difference in expression levels.

    """
    reader = convutils.make_csv_reader(expression_fileh)
    expression_by_id = {}
    duplicate_warning_issued = False
    for row in reader:
        value = float(row["expression"])
        gene_id = row["id"]
        if gene_id not in expression_by_id:
            expression_by_id[gene_id] = value
            continue

        # This ID was already recorded on an earlier row; warn only
        # once per file rather than once per duplicate.
        if not duplicate_warning_issued:
            logger.warning(
                "WARNING! Multiple expression values detected "
                "for at least one gene; continuing anyway."
            )
            duplicate_warning_issued = True
        # Of the duplicated values, retain the greatest.
        if value > expression_by_id[gene_id]:
            expression_by_id[gene_id] = value
    return expression_by_id
def parse_set_significances(significances_fileh):
    """Parses the set significances from a CSV file.

    Returns a dictionary with set names as keys. Each value is a list
    with one `(sign, magnitude)` tuple per significance column, where
    `sign` is the result of calling `get_sign` on the raw column value
    and `magnitude` is the absolute value of that column as a `float`.
    An empty file yields an empty dictionary.

    :Parameters:
    - `significances_fileh`: A CSV file containing gene set
      significances. The file should contain a header row, which will
      be ignored. The first column should contain the name of each gene
      set; each column following should correspond with that gene set's
      significance in each file, in order with respect to the input
      order on the command line.

    """
    csv_reader = convutils.make_csv_reader(significances_fileh,
            headers=False)
    # Discard the header row. The builtin `next()` is used rather than
    # the `.next()` method, which only exists on Python 2 iterators; the
    # default keeps an empty file from leaking a `StopIteration`.
    next(csv_reader, None)
    set_significances = {}
    for entry in csv_reader:
        # Column 0 is the set name; every later column is a significance.
        significances = [(get_sign(value), abs(float(value))) for value
                in entry[1:]]
        set_significances[entry[0]] = significances
    return set_significances
def results_to_edges_and_stats(
        csv_fileh,
        significance_cutoff,
        annotation1_column_title,
        annotation2_column_title,
        significance_column_title,
        annotation1_size_column_title,
        annotation1_neighbors_size_column_title,
        annotation2_size_column_title,
        selected_annotations,
        less_than=True
    ):
    """Reads results from a CSV file and converts them into edges.

    Returns a tuple `(edges, annotation_stats)`. `edges` is a
    dictionary keyed by `(annotation1, annotation2)` pairs whose values
    are the complete CSV entries of the rows that passed the
    significance test; if a pair occurs on several passing rows, the
    last such row is kept. `annotation_stats` is a dictionary that is
    currently always returned empty (see the TODO in the body).

    :Parameters:
    - `csv_fileh`: a CSV file with column headers
    - `significance_cutoff`: a value to use as the threshold for
      including an edge
    - `annotation1_column_title`: title of the column containing the
      name of the first annotation
    - `annotation2_column_title`: title of the column containing the
      name of the second annotation
    - `significance_column_title`: title of the column containing
      significance values (e.g., `'pvalue'`)
    - `annotation1_size_column_title`: title of the column containing
      the size of the set of genes annotated by the first annotation
      (currently unused)
    - `annotation1_neighbors_size_column_title`: title of the column
      containing the size of the set of genes neighboring those
      annotated by the first annotation (currently unused)
    - `annotation2_size_column_title`: title of the column containing
      the size of the set of genes annotated by the second annotation
      (currently unused)
    - `selected_annotations`: a `set` of annotations to which
      interactions should be restricted, or `None` to apply no
      restriction
    - `less_than`: whether the significance should be less than or
      equal to the `significance_cutoff` to be considered significant,
      or be greater than or equal to the `significance_cutoff`
      [default: `True`]

    """
    reader = convutils.make_csv_reader(csv_fileh)
    edges = {}
    annotation_stats = {}
    for row in reader:
        score = float(row[significance_column_title])
        if less_than:
            passes_cutoff = score <= significance_cutoff
        else:
            passes_cutoff = score >= significance_cutoff
        if not passes_cutoff:
            continue

        first_annotation = row[annotation1_column_title]
        second_annotation = row[annotation2_column_title]
        # When a restriction set is supplied, both ends of the edge must
        # belong to it for the edge to be kept.
        if selected_annotations is not None and (
                first_annotation not in selected_annotations
                or second_annotation not in selected_annotations):
            continue

        # TODO: Collection of per-annotation statistics (the sizes read
        # from the `annotation1_size_column_title`,
        # `annotation1_neighbors_size_column_title` and
        # `annotation2_size_column_title` columns) and the trimming of
        # the entry down to edge statistics only were commented out for
        # the mcmcbpn hack and still need fixing. Until then,
        # `annotation_stats` stays empty and the whole row is stored as
        # the edge's value.
        edges[(first_annotation, second_annotation)] = row
    return edges, annotation_stats