def _parse_by_pmid(input_fp):
    # Two passes over the file, going from pubmed ids with the fewest
    # interactions to those with the most.
    # First pass: collect pubmed ids with pointers to their interactions
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    pmids_map = offsets_by_pmid_consumer(scanner)
    sorted_pmids = sorted((len(v), k) for k, v in pmids_map.iteritems())
    # Second pass: re-read each pubmed id's interactions through the
    # stored offsets
    for _, pmid in sorted_pmids:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (pmids_map[pmid], ))
        pairs, complexes = full_interaction_consumer(scanner)
        yield pmid, pairs, complexes
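# A minimal usage sketch (hypothetical, not part of the module):
# _parse_by_pmid is a generator, so callers can stream one publication's
# interactions at a time. The file handle must stay open for the whole
# iteration because the second pass seeks back through the stored offsets.
def _print_counts_by_pmid(input_filename):
    input_fp = open(input_filename, 'rU')
    try:
        for pmid, pairs, complexes in _parse_by_pmid(input_fp):
            print '%d: %d pairs, %d complexes' % (pmid, len(pairs),
                                                  len(complexes))
    finally:
        input_fp.close()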
def process_biochem(biochem_file, output_fp, ontology, logfile_fp, counts):
    """
    Process all candidate 'biochemical' interactions in biochem_file and
    write consolidated interactions to output_fp.
    """
    input_fp = open(biochem_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    pairs, complexes = full_interaction_consumer(scanner)
    counts.pmids.update(intr.pmid for intr in pairs)
    counts.pmids.update(intr.pmid for intr in complexes)

    # Extract additional interactions from 'complexes', which are in fact
    # biochemical reactions described in more detail
    pairs2, complexes2 = _extract_biochem_from_complexes(complexes)

    # Collect source interactions by (directed) pair of proteins plus
    # pubmed id
    pmid_links, unassigned = _collect_by_pmid(pairs + pairs2,
                                              _get_biochem_key)

    # Find conflicts and inconsistencies
    _get_biochem_key_conflicts(pmid_links, logfile_fp)

    # Attempt to assign interactions from DIP, which are not directional
    unresolved = _collect_biochem_unassigned(unassigned, pmid_links)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(pmid_links.keys())
    for key in sorted_keys:
        interactions = pmid_links[key]
        counts.biochem += len(interactions)
        _correct_biogrid_biochem(interactions)
        for new_intr in _consolidate_links(interactions, ontology, 'D',
                                           logfile_fp, counts):
            new_intr.to_file(output_fp)
            counts.directed += 1

    # Write all 'unused' interactions
    for item in unresolved:
        key = _get_undirected_key(item)
        for new_intr in _consolidate_links([item], ontology, 'B',
                                           logfile_fp, counts):
            new_intr.to_file(output_fp)
            counts.undirected += 1

    # Write unused complexes
    for cmplx in complexes2:
        key = _get_undirected_key(cmplx)
        for new_intr in _consolidate_links([cmplx], ontology, 'C',
                                           logfile_fp, counts):
            new_intr.to_file(output_fp)
            counts.complexes += 1
    input_fp.close()
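# Sketch of the grouping-key idea assumed above (the real _get_biochem_key
# and _get_undirected_key are defined elsewhere in the module; the
# attribute names follow the p.uid.acc usage in summarize_complexes below).
# A biochemical reaction is keyed by the *ordered* protein pair plus the
# publication, so the same enzyme-substrate pair reported by several source
# databases for one paper collapses into a single directed interaction.
def _get_biochem_key_sketch(intr):
    a, b = (p.uid.acc for p in intr.interactors[:2])
    return (a, b, intr.pmid)                      # order kept: directed

def _get_undirected_key_sketch(intr):
    a, b = (p.uid.acc for p in intr.interactors[:2])
    return (min(a, b), max(a, b), intr.pmid)      # order ignored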
def process_complexes(complexes_file, output_fp, ontology, logfile_fp,
                      counts):
    """
    Process all candidate 'complexes' in complexes_file and write
    consolidated interactions to output_fp.
    """
    input_fp = open(complexes_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    offsets_map = _consume_complex_offsets_only(scanner)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(offsets_map.iterkeys())
    for key in sorted_keys:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (offsets_map[key], ))
        _, complexes = full_interaction_consumer(scanner)
        counts.pmids.update(intr.pmid for intr in complexes)
        for new_intr in _consolidate_complexes(complexes, ontology,
                                               logfile_fp):
            new_intr.to_file(output_fp)
            counts.complexes += 1
    input_fp.close()
def __call__(self, ppiTrim_file, *dbargs, **dbkwargs):
    """
    Insert every interaction from ppiTrim_file into the database,
    committing only after all rows and conflict records are in.
    """
    self.connect_to_db(*dbargs, **dbkwargs)
    input_fp = open(ppiTrim_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    for intr, _ in scanner:
        self._insert_interaction(intr)
    self._insert_all_conflicts()
    input_fp.close()
    self.cur.close()
    self.conn.commit()
    self.conn.close()
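# Hypothetical usage; the class name 'InteractionLoader' and the connection
# keyword arguments are illustrative only, not taken from the source.
# Because the single commit happens only after _insert_all_conflicts(), a
# failure partway through leaves the database unchanged.
loader = InteractionLoader()                      # assumed class name
loader('consolidated.mitab', database='ppitrim', user='ppitrim')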
def process_undirected(binary_file, output_fp, ontology, logfile_fp, counts):
    """
    Process all 'other' interactions - undirected physical interactions.
    Write consolidated interactions to output_fp.
    """
    input_fp = open(binary_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    offsets_map = _consume_undirected_offsets_only(scanner)

    # Consolidate each interaction and write to file
    sorted_keys = sorted(offsets_map.iterkeys())
    for key in sorted_keys:
        scanner = parse_mitab_file(input_fp, partial_mitab_iterator,
                                   (offsets_map[key], ))
        pairs, _ = full_interaction_consumer(scanner)
        counts.pmids.update(intr.pmid for intr in pairs)
        counts.other += len(pairs)
        for new_intr in _consolidate_links(pairs, ontology, 'X',
                                           logfile_fp, counts):
            new_intr.to_file(output_fp)
            counts.undirected += 1
    input_fp.close()
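# A minimal sketch of the offsets-consumer pattern shared by
# _consume_undirected_offsets_only and _consume_complex_offsets_only above
# (assumed shape, inferred from how partial_mitab_iterator is fed; the real
# consumers live elsewhere in the module). The first pass records the file
# offsets of each key's lines; the second pass re-parses only those lines.
def _offsets_by_key_sketch(scanner, keyfunc):
    offsets_map = {}
    # Assumes the scanner yields (interaction, (offsets, line_numbers)),
    # matching the lines[1] usage in process_ppi1 below.
    for interaction, (offsets, line_numbers) in scanner:
        offsets_map.setdefault(keyfunc(interaction), []).extend(offsets)
    return offsets_map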
def process_ppi1(irefindex_file, id_logfile, output_logfile, biochem_file,
                 binary_file, complexes_file, obo_file,
                 biogrid_ptm_codes_file, filtered_pmids_file=None,
                 accepted_taxids=None):
    counts = Phase1Counts()
    filtered_pmids = read_filtered_pmids(filtered_pmids_file)
    id_map = read_id_mapping(id_logfile)

    # Load the PSI-MI ontology and build the interaction filters
    obo_fp = open(obo_file, 'rU')
    ontology = obo.OBOntology(obo_fp)
    biochem_filter = _BiochemFilter(ontology, biogrid_ptm_codes_file)
    complex_filter = _ComplexFilter(ontology)

    # Open output streams; their order must match the index returned by
    # _process_interaction
    input_fp = open(irefindex_file, 'rU')
    removed_fp = NullFile()
    biochem_fp = open(biochem_file, 'w')
    binary_fp = open(binary_file, 'w')
    complex_fp = open(complexes_file, 'w')
    logfile_fp = open(output_logfile, 'w')

    output_fps = (removed_fp, binary_fp, complex_fp, biochem_fp)
    for fp in output_fps:
        Interaction.write_header(fp)

    # Route each source interaction to the output file chosen by
    # _process_interaction
    scanner = parse_mitab_file(input_fp, full_mitab_iterator, None,
                               iRefIndexInteraction)
    for interaction, lines in scanner:
        line_numbers = lines[1]
        res = _process_interaction(interaction, id_map, filtered_pmids,
                                   logfile_fp, counts, line_numbers,
                                   ontology, biochem_filter, complex_filter,
                                   accepted_taxids)
        interaction.to_file(output_fps[res])

    counts.to_file(logfile_fp)
    input_fp.close()
    obo_fp.close()
    logfile_fp.close()
    for fp in output_fps:
        fp.close()
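# Example driver call (all file names, and the container type used for
# accepted_taxids, are illustrative assumptions): one call runs the whole
# first phase, splitting an iRefIndex release into candidate biochemical,
# binary and complex files for the later phases, here restricted to human
# (taxid 9606) records.
process_ppi1('iRefIndex_release.mitab', 'id_mapping.log', 'phase1.log',
             'biochem.tmp', 'binary.tmp', 'complexes.tmp',
             'psi-mi.obo', 'biogrid_ptm_codes.txt',
             filtered_pmids_file=None,
             accepted_taxids=set(['9606']))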
def summarize_complexes(ppiTrim_file, output_file):
    """
    Output a summary of generated complexes of 'A' and 'N' type. These are
    randomly shuffled for sampling.
    """
    lines = []
    input_fp = open(ppiTrim_file, 'rU')
    output_fp = open(output_file, 'w')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator)
    for intr, _ in scanner:
        edgetype = intr.edgetype
        n = len(intr.interactors)
        if not intr.is_complex() or n > 20:
            continue
        if 'C' in edgetype or 'G' in edgetype or 'R' in edgetype:
            continue
        if 'A' not in edgetype:
            continue
        geneids = ', '.join(p.uid.acc for p in intr.interactors)
        symbols = ', '.join(p.alias.ids[0].acc for p in intr.interactors)
        sourcedbs = ''  # guard against records lacking a 'sourcedbs' entry
        for item in intr.confidence.ids:
            if item.db == 'sourcedbs':
                sourcedbs = item.acc
                break
        lines.append('\t'.join([intr.complex.uid.acc,
                                '%d' % n,
                                edgetype,
                                sourcedbs,
                                '%d' % intr.pmid,
                                symbols,
                                geneids,
                                '*****']))

    random.shuffle(lines)
    for line in lines:
        output_fp.write(line)
        output_fp.write('\n')
    input_fp.close()
    output_fp.close()
def extract_protein_ids(mitab_file, filtered_pmids, accepted_taxids):
    """
    Extract protein IDs from an iRefIndex PSI-MI TAB 2.6 file.
    """
    protein_ids = set()
    nullfile = NullFile()
    input_fp = open(mitab_file, 'rU')
    scanner = parse_mitab_file(input_fp, full_mitab_iterator, None,
                               iRefIndexInteraction)
    for interaction, lines in scanner:
        line_numbers = lines[1]
        if is_filtered(interaction, filtered_pmids, nullfile, line_numbers,
                       accepted_taxids):
            continue
        for p in interaction.interactors:
            # p stands for 'protein interactor'
            protein_ids.add(p.id)
    input_fp.close()
    return protein_ids
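# Hypothetical usage (file names and the taxid container type are
# illustrative assumptions): collect the distinct protein identifiers that
# survive filtering, e.g. as input for building an identifier mapping.
filtered = read_filtered_pmids('filtered.pmids')
ids = extract_protein_ids('iRefIndex_release.mitab', filtered,
                          set(['9606']))
print '%d distinct protein ids' % len(ids)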