Example #1
0
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        Parses OBO-style term stanzas stored in column 3 of the raw_line
        file and builds a dictionary mapping each original term id (and any
        alt_id) to a 'knowledge-network-id::knowledge-network-name' string.
        As a side effect, node and node_meta files are written next to the
        input and then deduplicated into unique.* copies.

        Note: key_col and value_col are accepted for interface
        compatibility with sibling implementations but are not used here;
        the stanza structure drives the parse instead.

        Args:
            filename (str): The name of the raw_line file containing the
                OBO term information needed to produce the mapping
                dictionary.
            key_col (int): Unused; kept for signature compatibility.
            value_col (int): Unused; kept for signature compatibility.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types,
                keyed by original (and alternate) term ids.
        """
        term_map = dict()
        n_type = 'Property'
        # Output paths are derived from the raw_line input path.
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
        # Ignore everything until the first [Term] stanza is seen.
        skip = True
        with open(filename) as infile, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            reader = csv.reader(infile, delimiter='\t')
            n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in reader:
                # Column 3 of the raw_line row holds the original OBO line.
                raw = line[3]
                if raw.startswith('[Term]'):
                    # New term stanza: reset per-term state.
                    skip = False
                    orig_id, kn_id, orig_name, kn_name = ['', '', '', '']
                    continue
                if raw.startswith('[Typedef]'):
                    # Typedef stanzas are not terms; skip until next [Term].
                    skip = True
                    continue
                if skip:
                    continue
                if raw.startswith('id: '):
                    orig_id = raw[4:].strip()
                    kn_id = cf.pretty_name(orig_id)
                    continue
                if raw.startswith('name: '):
                    # Assumes 'name:' follows 'id:' within a stanza, so
                    # orig_id/kn_id are populated here — TODO confirm for
                    # malformed inputs.
                    orig_name = raw[6:].strip()
                    kn_name = cf.pretty_name('go_' + orig_name)
                    term_map[orig_id] = kn_id + '::' + kn_name
                    n_writer.writerow([kn_id, kn_name, n_type])
                    n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                    n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
                if raw.startswith('alt_id: '):
                    # Alternate ids map to the same knowledge-network entry.
                    alt_id = raw[8:].strip()
                    term_map[alt_id] = kn_id + '::' + kn_name
                    n_meta_writer.writerow([kn_id, 'alt_alias', alt_id])
        # Collapse duplicates in the produced node and node_meta files.
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)

        return term_map
Example #2
0
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        This returns a dictionary for use in mapping nodes or edge types
        from the file specified by filename. It opens the raw_line file and
        builds a dictionary using the key_col column as the key and the
        value_col column as the value, while also writing node and node_meta
        files alongside the input (deduplicated into unique.* copies). If
        the alias encoded in the filename is not a mapping alias, an empty
        dictionary is returned and no files are written.

        Args:
            filename (str): The name of the file containing the information
                needed to produce the mapping dictionary. Expected to be
                named '<src>.<alias>. ... raw_line ...'.
            key_col (int): The column containing the key for creating the
                dictionary. By default this is column 3.
            value_col (int): The column containing the value for creating
                the dictionary. By default this is column 4.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types.
        """
        src = filename.split('.')[0]
        alias = filename.split('.')[1]
        map_dict = dict()
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        if not self.is_map(alias):
            return map_dict
        with open(filename, 'rb') as map_file, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            # Decode explicitly so undecodable bytes fail here rather than
            # corrupting the csv parse.
            reader = csv.reader((line.decode('utf-8') for line in map_file),
                                delimiter='\t')
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in reader:
                # NOTE(review): the line hash in column 2 was previously
                # read into an unused local; the dead store was removed.
                orig_id = line[key_col].strip()
                orig_name = line[value_col].strip()
                kn_id = cf.pretty_name(orig_id)
                kn_name = cf.pretty_name(src + '_' + orig_name)
                map_dict[orig_id] = kn_id + '::' + kn_name
                n_writer.writerow([kn_id, kn_name])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
        # Collapse duplicates in the produced node and node_meta files.
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        return map_dict
Example #3
0
    def table(self, raw_line, version_dict):
        """Use the provided raw_line file to produce a table file, a node
        file, and a node_meta file (property nodes).

        This returns nothing but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source; only the 'alias' entry is read here and
                used as the n2 species value.

        Returns:
            None
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')

        #static column values
        n1type = 'property'
        n_type = 'Property'
        n2type = 'gene'
        n1hint = 'Pfam/Family'
        n2hint = 'Uniprot_gn'
        et_hint = 'pfam_prot'
        n1spec = '0'
        src = 'pf'

        # NOTE(review): a species.json file was previously loaded into a
        # local that was never read; that dead load (and an unused mapping
        # dict accumulated in the loop) were removed.
        n2spec = version_dict['alias']

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            edge_writer = csv.writer(edges,
                                     delimiter='\t',
                                     lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split()
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]

                # skip commented lines
                if raw[0].startswith('#'):
                    continue

                orig_id = raw[5].strip()
                orig_name = raw[6].strip()
                kn_id = cf.pretty_name(src + '_' + orig_id)
                kn_name = cf.pretty_name(src + '_' + orig_name)
                n_writer.writerow([kn_id, kn_name, n_type])
                n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
                n2orig = raw[0]
                # Convert the e-value into a -log10 score clamped to
                # [sc_min, sc_max]; rows scoring below sc_min are dropped
                # (their node/node_meta rows above are still written).
                evalue = float(raw[12])
                score = self.sc_min
                if evalue == 0.0:
                    score = self.sc_max
                if evalue > 0.0:
                    score = round(-1.0 * math.log10(evalue), 4)
                if score > self.sc_max:
                    score = self.sc_max
                if score < self.sc_min:
                    continue

                output = [
                    chksm, kn_id, n1hint, n1type, n1spec, n2orig, n2hint,
                    n2type, n2spec, et_hint,
                    str(score)
                ]
                # The table hash covers the full edge row content.
                hasher = hashlib.md5()
                hasher.update('\t'.join(output).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow(output + [t_chksum])
        # Collapse duplicates in the produced node and node_meta files.
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
Example #4
0
    def create_mapping_dict(self, filename, key_col=3, value_col=4):
        """Return a mapping dictionary for the provided file.

        This returns a dictionary for use in mapping nodes or edge types
        from the file specified by filename. Column 3 of the raw_line file
        is used as the original id and column 4 as the original name; the
        key_col and value_col arguments are accepted for interface
        compatibility but are not consulted by this implementation. For the
        'pathway' alias, node and node_meta files are also written next to
        the input and deduplicated into unique.* copies.

        Args:
            filename (str): The name of the file containing the information
                needed to produce the mapping dictionary. Expected to be
                named '<src>.<alias>. ... raw_line ...'.
            key_col (int): Unused; kept for signature compatibility.
            value_col (int): Unused; kept for signature compatibility.

        Returns:
            dict: A dictionary for use in mapping nodes or edge types.
                Empty when the alias is not a mapping alias.
        """
        src = filename.split('.')[0]
        alias = filename.split('.')[1]
        map_dict = dict()
        n1_type = 'Property'
        # Output paths derived from the raw_line path (pathway alias only).
        n_meta_file = filename.replace('raw_line', 'node_meta')
        node_file = filename.replace('raw_line', 'node')
        if not self.is_map(alias):
            return map_dict

        if alias == 'pathway':
            with open(filename, 'rb') as map_file, \
                open(n_meta_file, 'w') as n_meta, \
                open(node_file, 'w') as nfile:
                reader = csv.reader(
                    (line.decode('utf-8') for line in map_file),
                    delimiter='\t')
                n_meta_writer = csv.writer(n_meta,
                                           delimiter='\t',
                                           lineterminator='\n')
                n_writer = csv.writer(nfile,
                                      delimiter='\t',
                                      lineterminator='\n')
                for line in reader:
                    orig_id = line[3].strip()
                    orig_name = line[4].strip()
                    # Drops any 'map' substring from the pathway id before
                    # prettifying — presumably stripping a 'map' prefix from
                    # the source's pathway ids; TODO confirm id format.
                    mod_id = src + '_' + orig_id.replace('map', '')
                    kn_id = cf.pretty_name(mod_id)
                    kn_name = cf.pretty_name(src + '_' + orig_name)
                    map_dict[orig_id] = kn_id + '::' + kn_name
                    n_writer.writerow([kn_id, kn_name, n1_type])
                    n_meta_writer.writerow([kn_id, 'orig_desc', orig_name])
                    n_meta_writer.writerow([kn_id, 'orig_id', orig_id])
            # Collapse duplicates in the produced node/node_meta files.
            outfile = node_file.replace('node', 'unique.node')
            tu.csu(node_file, outfile)
            outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
            tu.csu(n_meta_file, outfile)

        else:
            with open(filename, 'rb') as map_file:
                reader = csv.reader(
                    (line.decode('utf-8') for line in map_file),
                    delimiter='\t')
                for line in reader:
                    orig_id = line[3].strip()
                    orig_name = line[4].strip()
                    mod_id = src + '_' + orig_id
                    # Assumes orig_name looks like '<prefix>:<gene_id>' and
                    # keeps the part after the first colon — raises
                    # IndexError otherwise; TODO confirm input format.
                    kn_id = orig_name.split(':')[1]
                    kn_name = 'EntrezGene'
                    map_dict[mod_id] = kn_id + '::' + kn_name

        return map_dict
Example #5
0
    def table(self, raw_line, version_dict):
        """Uses the provided raw_line file to produce a 2table_edge file, an
        edge_meta file, a node and/or node_meta file (only for property nodes).

        This returns nothing but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            edge_meta (line_hash, info_type, info_desc)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source. Unused by this implementation.

        Returns:
            None
        """

        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')
        e_meta_file = raw_line.replace('raw_line', 'edge_meta')

        #static column values
        n1type = 'gene' #ignoring chemicals
        n1hint = 'UNIPROT_GN'
        n1spec = 'unknown'
        n2type = n1type #ignoring chemicals
        n2hint = n1hint
        n2spec = n1spec
        n3_type = 'property'  # pathway node columns for the second edge kind
        n3hint = 'unknown'
        n3spec = 'unknown'
        score = '1'
        n_type = 'Property'

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(e_meta_file, 'w') as e_meta, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
            e_meta_writer = csv.writer(e_meta, delimiter='\t', lineterminator='\n')
            n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if line[1] == '1': #skip header
                    continue
                chksm = line[0]
                raw = line[3:]
                if len(raw) != 7: #extended information
                    continue
                # mediator_ids is unpacked but intentionally unused.
                (n1id, et_hint, n2id, src, publist, n3id, mediator_ids) = raw
                et_hint = 'pathcom_' + et_hint.replace('-', '_')
                #n1-n2 edge
                # The table hash covers the full edge row content.
                hasher = hashlib.md5()
                hasher.update('\t'.join([chksm, n1id, n1hint, n1type, n1spec,
                                         n2id, n2hint, n2type, n2spec, et_hint,
                                         score]).encode())
                t_chksum = hasher.hexdigest()
                edge_writer.writerow([chksm, n1id, n1hint, n1type, n1spec,
                                      n2id, n2hint, n2type, n2spec, et_hint,
                                      score, t_chksum])
                # Provenance and (optional) publication references.
                e_meta_writer.writerow([chksm, 'original_source', src])
                if publist:
                    e_meta_writer.writerow([chksm, 'reference', publist])
                #pathway edge
                if n3id:
                    kn_n3id = cf.pretty_name('paco_' + n3id)
                    n_writer.writerow([kn_n3id, kn_n3id, n_type])
                    n_meta_writer.writerow([kn_n3id, 'orig_id', n3id])
                    # Link both interaction partners to the pathway node.
                    for node in [n1id, n2id]:
                        hasher = hashlib.md5()
                        hasher.update('\t'.join([chksm, kn_n3id, n3hint, n3_type,
                                                 n3spec, node, n1hint, n1type, n1spec,
                                                 'pathcom_pathway', score]).encode())
                        t_chksum = hasher.hexdigest()
                        edge_writer.writerow([chksm, kn_n3id, n3hint, n3_type,
                                              n3spec, node, n1hint, n1type, n1spec,
                                              'pathcom_pathway', score, t_chksum])
        # Collapse duplicates (edge_meta keyed on all three columns).
        outfile = e_meta_file.replace('edge_meta', 'unique.edge_meta')
        tu.csu(e_meta_file, outfile, [1, 2, 3])
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
Example #6
0
    def table(self, raw_line, version_dict):
        """Produce table, node, and node_meta files from a raw_line file.

        Nothing is returned; the following files are written next to the
        raw_line input:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file.
            version_dict (dict): A dictionary describing the attributes of
                the alias for a source.
        """
        # Output paths derived from the raw_line path.
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')

        # Static column values.
        alias = version_dict['alias']
        source = version_dict['source']
        mouse_aliases = ["MGI_Mammalian_Phenotype_2013",
                         "MGI_Mammalian_Phenotype_Level_3",
                         "MGI_Mammalian_Phenotype_Level_4", "Mouse_Gene_Atlas"]
        n1type = 'property'
        n_type = 'Property'
        n1spec = '0'
        n1hint = source + '_' + alias
        n2type = 'gene'
        # Mouse-derived aliases target mouse genes; everything else human.
        if alias in mouse_aliases:
            n2spec = '10090'
            n2hint = 'MGI'
        else:
            n2spec = '9606'
            n2hint = 'HGNC'
        (et_hint, node_prefix) = self.aliases[alias].split('::')
        score = 1

        # PPI hub proteins are gene-gene edges rather than property-gene.
        if alias == 'PPI_Hub_Proteins':
            n1type = 'gene'
            n1spec = '9606'
            n1hint = 'HGNC'

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges, \
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges, delimiter='\t', lineterminator='\n')
            n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for rline in infile:
                fields = rline.replace('"', '').strip().split('\t')
                if len(fields) == 1:
                    continue
                line_hash = fields[0]
                payload = fields[3:]
                n1_orig_name = payload[0]
                n1_kn_name = n1_orig_name
                if alias != 'PPI_Hub_Proteins':
                    # Property nodes get a prefixed, prettified name plus
                    # node and node_meta records.
                    n1_kn_name = cf.pretty_name(node_prefix + '_' + n1_orig_name)
                    n_meta_writer.writerow([n1_kn_name, 'orig_desc', n1_orig_name])
                    n_writer.writerow([n1_kn_name, n1_kn_name, n_type])
                for gene in payload[1:]:
                    # Each member may carry extra comma-separated data;
                    # only the id before the first comma is used.
                    gene = gene.split(',')[0]
                    if not gene:
                        continue
                    edge = [line_hash, n1_kn_name, n1hint, n1type, n1spec,
                            gene, n2hint, n2type, n2spec, et_hint, str(score)]
                    digest = hashlib.md5()
                    digest.update('\t'.join(edge).encode())
                    edge_writer.writerow(edge + [digest.hexdigest()])

        if alias != 'PPI_Hub_Proteins':
            # Collapse duplicates in the produced node/node_meta files.
            outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
            tu.csu(n_meta_file, outfile)
            outfile = node_file.replace('node', 'unique.node')
            tu.csu(node_file, outfile)
        else:
            # No property nodes exist for gene-gene aliases; drop the
            # (empty) node files that were opened above.
            os.remove(n_meta_file)
            os.remove(node_file)
Example #7
0
    def table(self, raw_line, version_dict):
        """Use the provided raw_line file to produce table, node, and
        node_meta files (property nodes).

        This returns nothing but produces the table formatted files from the
        provided raw_line file:
            raw_line (line_hash, line_num, file_id, raw_line)
            table_file (line_hash, n1name, n1hint, n1type, n1spec,
                     n2name, n2hint, n2type, n2spec, et_hint, score,
                     table_hash)
            node_meta (node_id,
                    info_type (evidence, relationship, experiment, or link),
                    info_desc (text))
            node (node_id, n_alias, n_type)

        Args:
            raw_line (str): The path to the raw_line file
            version_dict (dict): A dictionary describing the attributes of the
                alias for a source.

        Returns:
            None
        """
        #outfiles
        table_file = raw_line.replace('raw_line', 'table')
        n_meta_file = raw_line.replace('raw_line', 'node_meta')
        node_file = raw_line.replace('raw_line', 'node')

        #static column values
        alias = version_dict['alias']
        source = version_dict['source']
        n1type = 'property'
        n_type = 'Property'
        n1spec = '0'
        n2type = 'gene'
        n2spec = '9606'  # assumption of human genes is occasionally incorrect
        n2hint = 'EntrezGene'
        et_hint = source + '_' + alias.replace(".", "_")
        score = 1
        # NOTE(review): n1hint was previously pre-initialized from
        # source/alias, but that value was overwritten every iteration
        # before use; the dead store was removed — n1hint is now assigned
        # per row inside the loop.

        with open(raw_line, encoding='utf-8') as infile, \
            open(table_file, 'w') as edges,\
            open(n_meta_file, 'w') as n_meta, \
            open(node_file, 'w') as nfile:
            edge_writer = csv.writer(edges,
                                     delimiter='\t',
                                     lineterminator='\n')
            n_meta_writer = csv.writer(n_meta,
                                       delimiter='\t',
                                       lineterminator='\n')
            n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n')
            for line in infile:
                line = line.replace('"', '').strip().split('\t')
                if len(line) == 1:
                    continue
                chksm = line[0]
                raw = line[3:]
                n1_orig_name = raw[0]
                n1_url = raw[1]
                # The node id is an md5 of the original gene-set name; the
                # display name is the prettified original name.
                hasher = hashlib.md5()
                hasher.update(n1_orig_name.encode())
                n1_chksum = hasher.hexdigest()
                n1_kn_id = cf.pretty_name('msig_' + n1_chksum)
                n1_kn_name = cf.pretty_name('msig_' + n1_orig_name)
                n1hint = n1_kn_name
                n_meta_writer.writerow([n1_kn_id, 'orig_desc', n1_orig_name])
                n_meta_writer.writerow([n1_kn_id, 'link', n1_url])
                n_writer.writerow([n1_kn_id, n1_kn_name, n_type])
                for n2_id in raw[2:]:
                    # The table hash covers the full edge row content.
                    hasher = hashlib.md5()
                    hasher.update('\t'.join([chksm, n1_kn_id, n1hint, n1type,
                                             n1spec, n2_id, n2hint, n2type,
                                             n2spec, et_hint,
                                             str(score)]).encode())
                    t_chksum = hasher.hexdigest()
                    edge_writer.writerow([chksm, n1_kn_id, n1hint, n1type,
                                          n1spec, n2_id, n2hint, n2type,
                                          n2spec, et_hint, score, t_chksum])
        # Collapse duplicates in the produced node and node_meta files.
        outfile = n_meta_file.replace('node_meta', 'unique.node_meta')
        tu.csu(n_meta_file, outfile)
        outfile = node_file.replace('node', 'unique.node')
        tu.csu(node_file, outfile)