Example #1
def _parse_by_pmid(input_fp):

    # Process pubmed ids in two passes, from fewest to most interactions
    # First pass: collect pubmed ids with file offsets to their interactions
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    pmids_map = offsets_by_pmid_consumer(scanner)
    sorted_pmids = sorted((len(v), k) for k, v in pmids_map.iteritems())

    # Second pass
    for _, pmid in sorted_pmids:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (pmids_map[pmid], ))
        pairs, complexes = full_interaction_consumer(scanner)
        yield pmid, pairs, complexes
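
A minimal usage sketch for the generator above (hypothetical: it assumes
_parse_by_pmid and its helpers are importable from the surrounding module, and
'interactions.mitab' is a placeholder for a local MITAB file):

input_fp = open('interactions.mitab', 'rU')   # placeholder input file
try:
    for pmid, pairs, complexes in _parse_by_pmid(input_fp):
        # Each yielded tuple groups the interactions of a single pubmed id
        print('PMID %s: %d pairs, %d complexes'
              % (pmid, len(pairs), len(complexes)))
finally:
    input_fp.close()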
Example #2
def process_biochem(biochem_file, output_fp, ontology, logfile_fp, counts):
    """
    Process all candidate 'biochemical' interactions in biochem_file and write
    consolidated interactions to output_fp.
    """
    input_fp = open(biochem_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    pairs, complexes = full_interaction_consumer(scanner)

    counts.pmids.update(intr.pmid for intr in pairs)
    counts.pmids.update(intr.pmid for intr in complexes)

    # Extract additional interactions from 'complexes', which are in fact
    # biochemical reactions described in more detail
    pairs2, complexes2 = _extract_biochem_from_complexes(complexes)
    # Collect source interactions by (directed) pair of proteins plus pubmed id
    pmid_links, unassigned = _collect_by_pmid(pairs + pairs2, _get_biochem_key)
    # Find conflicts and inconsistencies
    _get_biochem_key_conflicts(pmid_links, logfile_fp)
    # Attempt to assign interactions from DIP, which are not directional
    unresolved = _collect_biochem_unassigned(unassigned, pmid_links)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(pmid_links.keys())
    for key in sorted_keys:
        interactions = pmid_links[key]
        counts.biochem += len(interactions)

        _correct_biogrid_biochem(interactions)

        for new_intr in _consolidate_links(interactions, ontology, 'D',
                                           logfile_fp, counts):
            new_intr.to_file(output_fp)
            counts.directed += 1

    # Write all unresolved interactions
    for item in unresolved:
        key = _get_undirected_key(item)
        for new_intr in _consolidate_links([item], ontology, 'B', logfile_fp,
                                           counts):
            new_intr.to_file(output_fp)
            counts.undirected += 1

    # Write unused complexes
    for cmplx in complexes2:
        key = _get_undirected_key(cmplx)
        for new_intr in _consolidate_links([cmplx], ontology, 'C', logfile_fp,
                                           counts):
            new_intr.to_file(output_fp)
            counts.complexes += 1

    input_fp.close()
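
The counts argument in process_biochem is only exercised here through a handful
of attributes (the consolidation helpers may update further ones); a minimal
stand-in consistent with that usage, which is an assumption rather than the
project's actual counts class, could look like this:

class _CountsStub(object):
    # Tracks only the attributes touched directly by process_biochem above
    def __init__(self):
        self.pmids = set()     # distinct pubmed ids seen
        self.biochem = 0       # candidate biochemical interactions
        self.directed = 0      # consolidated directed interactions written
        self.undirected = 0    # consolidated undirected interactions written
        self.complexes = 0     # consolidated complexes written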
Example #3
def process_complexes(complexes_file, output_fp, ontology, logfile_fp, counts):
    """
    Process all candidate 'complexes' in complexes_file and write
    consolidated interactions to output_fp.
    """
    input_fp = open(complexes_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    offsets_map = _consume_complex_offsets_only(scanner)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(offsets_map.iterkeys())
    for key in sorted_keys:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (offsets_map[key], ))
        _, complexes = full_interaction_consumer(scanner)
        counts.pmids.update(intr.pmid for intr in complexes)

        for new_intr in _consolidate_complexes(complexes, ontology,
                                               logfile_fp):
            new_intr.to_file(output_fp)
            counts.complexes += 1
    input_fp.close()
Example #4
    def __call__(self, ppiTrim_file, *dbargs, **dbkwargs):
        """
        Parse a ppiTrim MITAB file and insert every interaction into the
        database, followed by all conflicts.
        """
        self.connect_to_db(*dbargs, **dbkwargs)
        input_fp = open(ppiTrim_file, 'rU')
        scanner = parse_mitab_file(input_fp, full_mitab_iterator)
        for intr, _ in scanner:
            self._insert_interaction(intr)

        self._insert_all_conflicts()

        input_fp.close()
        self.cur.close()
        self.conn.commit()
        self.conn.close()
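
Assuming the method above belongs to a loader class (the class name and
database arguments below are hypothetical; they are forwarded to
connect_to_db), it would be invoked roughly like this:

loader = MitabDBLoader()                       # hypothetical class name
loader('ppiTrim_output.mitab', 'ppitrim.db')   # placeholder file and db args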
Example #5
def process_undirected(binary_file, output_fp, ontology, logfile_fp, counts):
    """
    Process all 'other' interactions - undirected physical interactions. Write
    consolidated interactions to output_fp.
    """
    input_fp = open(binary_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    offsets_map = _consume_undirected_offsets_only(scanner)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(offsets_map.iterkeys())
    for key in sorted_keys:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (offsets_map[key], ))
        pairs, _ = full_interaction_consumer(scanner)
        counts.pmids.update(intr.pmid for intr in pairs)
        counts.other += len(pairs)

        for new_intr in _consolidate_links(pairs, ontology, 'X', logfile_fp,
                                           counts):
            new_intr.to_file(output_fp)
            counts.undirected += 1
    input_fp.close()
Example #6
def process_ppi1(irefindex_file,
                 id_logfile,
                 output_logfile,
                 biochem_file,
                 binary_file,
                 complexes_file,
                 obo_file,
                 biogrid_ptm_codes_file,
                 filtered_pmids_file=None,
                 accepted_taxids=None):

    counts = Phase1Counts()
    filtered_pmids = read_filtered_pmids(filtered_pmids_file)

    id_map = read_id_mapping(id_logfile)

    obo_fp = open(obo_file, 'rU')
    ontology = obo.OBOntology(obo_fp)

    biochem_filter = _BiochemFilter(ontology, biogrid_ptm_codes_file)
    complex_filter = _ComplexFilter(ontology)

    input_fp = open(irefindex_file, 'rU')
    removed_fp = NullFile()    # removed interactions are silently discarded
    biochem_fp = open(biochem_file, 'w')
    binary_fp = open(binary_file, 'w')
    complex_fp = open(complexes_file, 'w')
    logfile_fp = open(output_logfile, 'w')

    output_fps = (removed_fp, binary_fp, complex_fp, biochem_fp)
    for fp in output_fps:
        Interaction.write_header(fp)

    # Classify each interaction and route it to the matching output file
    # (removed, binary, complex or biochemical)
    scanner = parse_mitab_file(input_fp, full_mitab_iterator, None,
                               iRefIndexInteraction)
    for interaction, lines in scanner:
        line_numbers = lines[1]
        res = _process_interaction(interaction, id_map, filtered_pmids,
                                   logfile_fp, counts, line_numbers, ontology,
                                   biochem_filter, complex_filter,
                                   accepted_taxids)
        interaction.to_file(output_fps[res])

    counts.to_file(logfile_fp)
    input_fp.close()
    obo_fp.close()
    logfile_fp.close()

    for fp in output_fps:
        fp.close()
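
A sketch of a driver call for process_ppi1 (all file names are placeholders;
the taxid filter is an assumption about the expected format):

process_ppi1('iRefIndex.mitab',           # raw iRefIndex PSI-MI TAB input
             'id_mapping.log',            # id mapping log produced earlier
             'phase1.log',                # processing log to write
             'biochem.mitab',             # biochemical candidates output
             'binary.mitab',              # undirected candidates output
             'complexes.mitab',           # complex candidates output
             'psi-mi.obo',                # PSI-MI ontology in OBO format
             'biogrid_ptm_codes.txt',     # BioGRID PTM codes file
             filtered_pmids_file=None,
             accepted_taxids=['9606'])    # e.g. human only (assumed format)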
Example #7
def summarize_complexes(ppiTrim_file, output_file):
    """
    Output a summary of generated complexes of 'A' type.
    These are randomly shuffled for sampling.
    """

    lines = []
    input_fp = open(ppiTrim_file, 'rU')
    output_fp = open(output_file, 'w')

    scanner = parse_mitab_file(input_fp, full_mitab_iterator)

    for intr, _ in scanner:
        edgetype = intr.edgetype
        n = len(intr.interactors)
        if not intr.is_complex() or n > 20:
            continue
        if 'C' in edgetype or 'G' in edgetype or 'R' in edgetype:
            continue

        if 'A' not in edgetype:
            continue

        geneids = ', '.join(p.uid.acc for p in intr.interactors)
        symbols = ', '.join(p.alias.ids[0].acc for p in intr.interactors)

        # Default to an empty value in case no 'sourcedbs' entry is present
        sourcedbs = ''
        for item in intr.confidence.ids:
            if item.db == 'sourcedbs':
                sourcedbs = item.acc
                break

        line = '\t'.join([
            intr.complex.uid.acc,
            '%d' % len(intr.interactors), edgetype, sourcedbs,
            '%d' % intr.pmid, symbols, geneids, '*****'
        ])
        lines.append(line)

    random.shuffle(lines)
    for line in lines:
        output_fp.write(line)
        output_fp.write('\n')

    input_fp.close()
    output_fp.close()
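
Usage is a single call (file names are placeholders); each output line is
tab-separated with the complex id, interactor count, edge type, source
databases, pubmed id, gene symbols, gene ids and a trailing '*****' marker:

summarize_complexes('ppiTrim_output.mitab', 'complexes_summary.txt')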
Example #8
def extract_protein_ids(mitab_file, filtered_pmids, accepted_taxids):
    """
    Extract protein IDs from iRefIndex PSI-MI TAB 2.6 file
    """

    protein_ids = set()
    nullfile = NullFile()

    input_fp = open(mitab_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator, None,
                               iRefIndexInteraction)
    for interaction, lines in scanner:
        line_numbers = lines[1]
        if is_filtered(interaction, filtered_pmids, nullfile, line_numbers,
                       accepted_taxids):
            continue

        for p in interaction.interactors:
            # p stands for 'protein interactor'
            protein_ids.add(p.id)

    input_fp.close()
    return protein_ids
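
A usage sketch (hypothetical: it relies on read_filtered_pmids from the
surrounding module and uses placeholder file names):

filtered_pmids = read_filtered_pmids('filtered_pmids.txt')
protein_ids = extract_protein_ids('iRefIndex.mitab', filtered_pmids,
                                  accepted_taxids=None)  # presumably no taxon filter
print('%d distinct protein ids' % len(protein_ids))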