Example #1
    def __init__(self):

        super(NcbiGeneParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        # both gene IDs and GeneSymbols have the label 'Gene'
        # two different NodeSets are used because only the GeneSymbol nodes need taxid for uniqueness
        self.genes = NodeSet(['Gene'],
                             merge_keys=['sid'],
                             default_props={'source': 'ncbigene'})
        self.genesymbols = NodeSet(['Gene'],
                                   merge_keys=['sid', 'taxid'],
                                   default_props={
                                       'source': 'ncbigene',
                                       'type': 'symbol'
                                   })
        self.genesymbol_synonym_genesymbol = RelationshipSet(
            'SYNONYM', ['Gene'], ['Gene'], ['sid', 'taxid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
        self.gene_maps_genesymbol = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid', 'taxid'],
            default_props={'source': 'ncbigene'})
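The parsers in these examples only declare their NodeSets and RelationshipSets; rows are added and merged elsewhere. As a rough sketch of how the sets declared in Example #1 would typically be filled and loaded (the connection details and the gene values are assumptions, not part of the original parser; the add/merge calls mirror Examples #7 and #27):

import py2neo

# assumed connection, following the pattern of Example #27
graph = py2neo.Graph(host='localhost', user='neo4j', password='neo4j')

parser = NcbiGeneParser()

# add one hypothetical NCBI gene and its symbol, then link them
parser.genes.add_node({'sid': '7157', 'name': 'TP53', 'taxid': '9606'})
parser.genesymbols.add_node({'sid': 'TP53', 'taxid': '9606'})
parser.gene_maps_genesymbol.add_relationship(
    {'sid': '7157'}, {'sid': 'TP53', 'taxid': '9606'}, {'source': 'ncbigene'})

# merge on the merge_keys declared in __init__, nodes before relationships
parser.genes.merge(graph)
parser.genesymbols.merge(graph)
parser.gene_maps_genesymbol.merge(graph)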
Example #2
    def __init__(self):
        """

        :param mesh_instance: NcbiGene Instance
        :type mesh_instance: DataSourceInstance
        """
        super(GtexMetadataParser, self).__init__()

        # NodeSets
        self.tissues = NodeSet(['GtexTissue'], merge_keys=['name'])
        self.detailed_tissues = NodeSet(['GtexDetailedTissue'],
                                        merge_keys=['name'])
        self.sample = NodeSet(['GtexSample'], merge_keys=['sid'])

        self.sample_measures_tissue = RelationshipSet('MEASURES',
                                                      ['GtexSample'],
                                                      ['GtexTissue'], ['sid'],
                                                      ['name'])
        self.sample_measures_detailed_tissue = RelationshipSet(
            'MEASURES', ['GtexSample'], ['GtexDetailedTissue'], ['sid'],
            ['name'])
        self.tissue_parent_detailed_tissue = RelationshipSet(
            'PARENT', ['GtexTissue'], ['GtexDetailedTissue'], ['name'],
            ['name'])
        self.tissue_parent_detailed_tissue.unique = True
Example #3
    def __init__(self):
        """
        :param ensembl_instance: The ENSEMBL DataSource instance.
        """
        super(EnsemblEntityParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSets
        self.genes = NodeSet(['Gene'],
                             merge_keys=['sid'],
                             default_props={'source': 'ensembl'})
        self.transcripts = NodeSet(['Transcript'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'ensembl'})
        self.proteins = NodeSet(['Protein'],
                                merge_keys=['sid'],
                                default_props={'source': 'ensembl'})

        # RelationshipSets
        self.gene_codes_transcript = RelationshipSet(
            'CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_codes_protein = RelationshipSet(
            'CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
Example #4
    def __init__(self):
        super(LncipediaParser, self).__init__()

        self.genes = NodeSet(['Gene'], merge_keys=['sid'])
        self.transcripts = NodeSet(['Transcript'], merge_keys=['sid'])
        self.gene_codes_transcripts = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'])
        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'])
        self.transcript_maps_transcript = RelationshipSet('MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'])
Example #5
    def __init__(self):

        super(SwissLipidsParser, self).__init__()

        # define NodeSet and RelationshipSet
        self.lipids = NodeSet(['Lipid'], merge_keys=['sid'])

        self.lipid_fromclass_lipid = RelationshipSet('FROM_LIPID_CLASS', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_parent_lipid = RelationshipSet('HAS_PARENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_component_lipid = RelationshipSet('HAS_COMPONENT', ['Lipid'], ['Lipid'], ['sid'], ['sid'])
        self.lipid_maps_metabolite = RelationshipSet('MAPS', ['Lipid'], ['Metabolite'], ['sid'], ['sid'])
        self.lipid_associates_protein = RelationshipSet('HAS_ASSOCIATION', ['Lipid'], ['Protein'], ['sid'], ['sid'])
Example #6
    def __init__(self):
        """
        :param refseq_instance: The RefSeq DataSource instance.
        """
        super(RefseqCodesParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define NodeSet and RelationshipSet
        self.gene_codes_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'refseq'})
Example #7
def load_wpp_data(base_path, graph):
    """
    Load UN population data.

    :param base_path: Path where file was downloaded.
    """
    un_wpp_csv_file = os.path.join(base_path, 'WPP2019_PopulationByAgeSex_Medium.csv')
    log.info('Parse UN population data file: {}'.format(un_wpp_csv_file))

    country = NodeSet(['Country'], ['name'])
    age_group_nodes = NodeSet(['AgeGroup'], ['group'])
    country_total_group = RelationshipSet('CURRENT_TOTAL', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_male_group = RelationshipSet('CURRENT_MALE', ['Country'], ['AgeGroup'], ['name'], ['group'])
    country_female_group = RelationshipSet('CURRENT_FEMALE', ['Country'], ['AgeGroup'], ['name'], ['group'])

    countries_added = set()
    age_groups_added = set()

    with open(un_wpp_csv_file, 'rt') as f:
        csv_file = csv.reader(f, delimiter=',', quotechar='"')
        # skip header
        next(csv_file)
        for row in csv_file:
            # LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,PopMale,PopFemale,PopTotal
            loc_id = row[0]
            location = row[1]
            time = int(row[4])
            age_group = row[6]
            age_group_start = int(row[7])
            age_group_span = row[8]
            pop_male = int(float(row[9]) * 1000)
            pop_female = int(float(row[10]) * 1000)
            pop_total = int(float(row[11]) * 1000)

            # only take 2019
            if time == 2019:
                if location not in countries_added:
                    country.add_node({'name': location, 'un_id': loc_id})
                    countries_added.add(location)
                if age_group not in age_groups_added:
                    age_group_nodes.add_node({'group': age_group, 'start': age_group_start, 'span': age_group_span})
                    age_groups_added.add(age_group)

                country_total_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_total})
                country_male_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_male})
                country_female_group.add_relationship({'name': location}, {'group': age_group}, {'count': pop_female})

    log.info('Load data to Neo4j')
    country.merge(graph)
    age_group_nodes.merge(graph)
    country_total_group.merge(graph)
    country_male_group.merge(graph)
    country_female_group.merge(graph)
Example #8
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    :param file: Path to the CSV file
    :return:
    """
    log.info('Read JHU CSV file {}'.format(file))

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'], ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'], ['DailyReport'], ['name'], ['uuid'])

    with open(file, 'rt') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip header
        next(rows)

        for row in rows:
            country = row[1]
            province = row[0]
            # if no name for province, use country name
            if not province:
                province = '{}_complete'.format(country)

            date = parse(row[2])
            uuid = country+province+str(date)
            confirmed = int(row[3]) if row[3] else 'na'
            death = int(row[4]) if row[4] else 'na'
            recovered = int(row[5]) if row[5] else 'na'

            lat = row[6] if len(row) >= 7 else None
            long = row[7] if len(row) >= 8 else None

            province_dict = {'name': province}
            if lat and long:
                province_dict['latitude'] = lat
                province_dict['longitude'] = long
            provinces.add_unique(province_dict)

            countries.add_unique({'name': country})

            updates.add_unique(
                {'date': date, 'confirmed': confirmed, 'death': death, 'recovered': recovered, 'uuid': uuid})

            province_in_country.add_relationship({'name': province}, {'name': country}, {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province}, {'uuid': uuid}, {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
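Example #8 returns its NodeSets and RelationshipSets instead of merging them, so loading happens at the call site. A minimal sketch of such a call site, assuming a py2neo connection as in Example #27 and a hypothetical report path (the merge calls follow Example #7):

import py2neo

graph = py2neo.Graph(host='localhost', user='neo4j', password='neo4j')  # assumed connection

countries, provinces, updates, province_in_country, province_rep_update = \
    read_daily_report_data_csv_JHU('daily_reports/01-22-2020.csv')  # hypothetical path

# nodes first, then relationships, so the relationship MERGE can match its endpoints
for object_set in (countries, provinces, updates, province_in_country, province_rep_update):
    object_set.merge(graph)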
Example #9
    def __init__(self):
        super(HmdbParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'hmdb'})

        self.metabolite_map_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
        self.metabolite_associates_protein = RelationshipSet(
            'HAS_ASSOCIATION', ['Metabolite'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'hmdb'})
Example #10
    def __init__(self):
        super(RefseqRemovedRecordsParser, self).__init__()

        self.arguments = ['taxid']

        self.legacy_ids = set()

        self.legacy_transcripts = NodeSet(['Transcript', 'Legacy'], merge_keys=['sid'], default_props={'source': 'refseq'})
        self.legacy_transcript_now_transcript = RelationshipSet('REPLACED_BY', ['Transcript'], ['Transcript'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.legacy_proteins = NodeSet(['Protein', 'Legacy'], merge_keys=['sid'], default_props={'source': 'refseq'})
        self.legacy_protein_now_protein = RelationshipSet('REPLACED_BY', ['Protein'], ['Protein'],
                                                                ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.gene_codes_legacy_transcript = RelationshipSet('CODES', ['Gene'], ['Transcript', 'Legacy'], ['sid'], ['sid'], default_props={'source': 'refseq'})
        self.legacy_transcript_codes_protein = RelationshipSet('CODES', ['Transcript', 'Legacy'], ['Protein'],
                                                               ['sid'], ['sid'], default_props={'source': 'refseq'})
Example #11
    def __init__(self):
        super(ChebiParser, self).__init__()

        # NodeSets
        self.metabolites = NodeSet(['Metabolite'],
                                   merge_keys=['sid'],
                                   default_props={'source': 'chebi'})
        self.metabolite_isa_metabolite = RelationshipSet(
            'IS_A', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_rel_metabolite = RelationshipSet(
            'CHEBI_REL', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
        self.metabolite_maps_metabolite = RelationshipSet(
            'MAPS', ['Metabolite'], ['Metabolite'], ['sid'], ['sid'],
            default_props={'source': 'chebi'})
Example #12
    def __init__(self):
        super(MirbaseParser, self).__init__()

        # NodeSets
        self.precursor_mirna = NodeSet(['PrecursorMirna'], merge_keys=['sid'])
        self.mature_mirna = NodeSet(['Mirna'], merge_keys=['sid'])
        # RelationshipSets
        self.precursor_codes_mature = RelationshipSet('PRE',
                                                      ['PrecursorMirna'],
                                                      ['Mirna'], ['sid'],
                                                      ['sid'])
        self.transcript_codes_precursor = RelationshipSet(
            'IS', ['Transcript'], ['PrecursorMirna'], ['sid'], ['sid'])
        self.gene_is_precursor = RelationshipSet('IS', ['Gene'],
                                                 ['PrecursorMirna'], ['sid'],
                                                 ['sid'])
Example #13
    def __init__(self):
        """
        """
        super(MirtarbaseParser, self).__init__()

        # RelationshipSets
        self.mirna_targets_gene = RelationshipSet('TARGETS', ['Mirna'], ['Gene'], ['name'], ['sid'])
Example #14
    def __init__(self):
        super(SomeParser, self).__init__()

        self.source = NodeSet(['Source'], merge_keys=['source_id'])
        self.target = NodeSet(['Target'], merge_keys=['target_id'])
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'],
                                    ['source_id'], ['target_id'])
Example #15
    def __init__(self):
        super(GeneOntologyAssociationParser, self).__init__()

        self.arguments = ['taxid']

        # RelationshipSets
        self.protein_associates_goterm = RelationshipSet(
            'ASSOCIATION', ['Protein'], ['Term'], ['sid'], ['sid'])
Example #16
    def __init__(self):

        super(EnsemblMappingParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # define NodeSet and RelationshipSet
        self.gene_maps_gene = RelationshipSet(
            'MAPS', ['Gene'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.transcript_maps_transcript = RelationshipSet(
            'MAPS', ['Transcript'], ['Transcript'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
        self.protein_maps_protein = RelationshipSet(
            'MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'],
            default_props={'source': 'ensembl'})
Example #17
    def __init__(self):
        """

        :param ncbigene_instance: NcbiGene Instance
        :type ncbigene_instance: DataSourceInstance
        :param taxid:
        """
        super(HGNCParser, self).__init__()

        # output data
        self.genes = NodeSet(['Gene'], merge_keys=['sid'])

        self.gene_maps_gene = RelationshipSet('MAPS', ['Gene'], ['Gene'],
                                              ['sid'], ['sid'])
        self.gene_maps_genesymbol = RelationshipSet('MAPS', ['Gene'],
                                                    ['GeneSymbol'], ['sid'],
                                                    ['sid', 'taxid'])
Example #18
    def __init__(self):
        """
        :param uniprot_instance: The Uniprot instance
        :param taxid: The taxid
        """
        super(UniprotKnowledgebaseParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # NodeSet
        self.proteins = NodeSet(['Protein'], merge_keys=['sid'], default_props={'source': 'uniprot'})

        # RelationshipSet
        self.protein_primary_protein = RelationshipSet('PRIMARY', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
        self.transcript_codes_protein = RelationshipSet('CODES', ['Transcript'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
        self.protein_maps_protein = RelationshipSet('MAPS', ['Protein'], ['Protein'], ['sid'], ['sid'], default_props={'source': 'uniprot'})
Example #19
    def __init__(self):
        super(MirdbParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # RelationshipSets
        self.mirna_targets_transcript = RelationshipSet(
            'TARGETS', ['Mirna'], ['Transcript'], ['name'], ['sid'])
Example #20
    def __init__(self):
        super(MeshParser, self).__init__()

        # NodeSets
        self.descriptor = NodeSet(['MeshDescriptor'], merge_keys=['sid'])
        self.qualifier = NodeSet(['MeshQualifier'], merge_keys=['sid'])
        self.concept = NodeSet(['MeshConcept'], merge_keys=['sid'])
        self.term = NodeSet(['MeshTerm'], merge_keys=['sid'])

        self.descriptor_allowed_qualifier = RelationshipSet('ALLOWED', ['MeshDescriptor'], ['MeshQualifier'], ['sid'],
                                                            ['sid'])

        self.descriptor_has_concept = RelationshipSet('HAS', ['MeshDescriptor'], ['MeshConcept'], ['sid'], ['sid'])
        self.descriptor_has_concept.unique = True
        self.concept_has_term = RelationshipSet('HAS', ['MeshConcept'], ['MeshTerm'], ['sid'], ['sid'])
        self.concept_has_term.unique = True
        self.concept_related_concept = RelationshipSet('RELATED', ['MeshConcept'], ['MeshConcept'], ['sid'], ['sid'])
        self.concept_related_concept.unique = True
Example #21
    def __init__(self):
        super(NcbiLegacyGeneParser, self).__init__()

        self.arguments = ['taxid']

        self.legacy_genes = NodeSet(['Gene', 'Legacy'],
                                    merge_keys=['sid'],
                                    default_props={'source': 'ncbigene'})
        self.legacy_gene_now_gene = RelationshipSet(
            'REPLACED_BY', ['Gene', 'Legacy'], ['Gene'], ['sid'], ['sid'],
            default_props={'source': 'ncbigene'})
Example #22
    def __init__(self):
        super(DummyParser, self).__init__()

        # arguments
        self.arguments = ['taxid']

        # output data
        self.dummy_nodes = NodeSet(['Dummy'], merge_keys=['sid'])
        self.fummy_nodes = NodeSet(['Fummy'], merge_keys=['sid'])

        self.dummy_knows_fummy = RelationshipSet('KNOWS', ['Dummy'], ['Fummy'],
                                                 ['sid'], ['sid'])
Example #23
    def __init__(self):
        """

        :param ncbigene_instance: NcbiGene Instance
        :type ncbigene_instance: DataSourceInstance
        :param taxid:
        """
        super(NcbiGeneOrthologParser, self).__init__()

        self.gene_ortholog_gene = RelationshipSet('ORTHOLOG', ['Gene'],
                                                  ['Gene'], ['sid'], ['sid'])

        self.object_sets = [self.gene_ortholog_gene]
        self.container.add_all(self.object_sets)
Example #24
    def __init__(self):
        """

        :param mesh_instance: NcbiGene Instance
        :type mesh_instance: DataSourceInstance
        """
        super(GtexDataParser, self).__init__()

        self.gene_expressed_tissue = RelationshipSet('EXPRESSED', ['Gene'],
                                                     ['GtexDetailedTissue'],
                                                     ['sid'], ['name'])

        self.object_sets = [self.gene_expressed_tissue]

        self.container.add_all(self.object_sets)
Example #25
    def _define_node_and_relatinship_sets(self):
        # Define nodesets
        nodeSets = {}
        relSets = {}
        nodeSets["Papers"] = NodeSet(["Paper"], ["paper_id"])
        nodeSets["PaperIDHubs"] = NodeSet(
            [self.id_node_label, config.JSON2GRAPH_COLLECTION_NODE_LABEL],
            ["id"])
        nodeSets["Metadata"] = NodeSet(["Metadata"], ["_hash_id"])
        nodeSets["Authors"] = NodeSet(["Author"], ["_hash_id"])
        nodeSets["AuthorHubs"] = NodeSet(
            ["Author", config.JSON2GRAPH_COLLECTION_NODE_LABEL], ["id"])
        nodeSets["Abstracts"] = NodeSet(["Abstract"], ["_hash_id"])
        nodeSets["AbstractHubs"] = NodeSet(
            ["Abstract", config.JSON2GRAPH_COLLECTION_NODE_LABEL], ["id"])

        relSets["PAPER_HAS_PAPERID_COLLECTION"] = RelationshipSet(
            rel_type="PAPER_HAS_PAPERID_COLLECTION",
            start_node_labels=["Paper"],
            end_node_labels=[
                self.id_node_label,
                config.JSON2GRAPH_COLLECTION_NODE_LABEL,
            ],
            start_node_properties=["paper_id"],
            end_node_properties=["id"],
        )

        relSets["PAPER_HAS_METADATA"] = RelationshipSet(
            rel_type="PAPER_HAS_METADATA",
            start_node_labels=["Paper"],
            end_node_labels=["Metadata"],
            start_node_properties=["paper_id"],
            end_node_properties=["_hash_id"],
        )

        relSets["METADATA_HAS_AUTHORHUB"] = RelationshipSet(
            rel_type="METADATA_HAS_AUTHOR",
            start_node_labels=["Metadata"],
            end_node_labels=[
                "Author", config.JSON2GRAPH_COLLECTION_NODE_LABEL
            ],
            start_node_properties=["_hash_id"],
            end_node_properties=["id"],
        )
        relSets["AUTHORHUB_HAS_AUTHOR"] = RelationshipSet(
            rel_type="AUTHOR_HAS_AUTHOR",
            start_node_labels=[
                "Author", config.JSON2GRAPH_COLLECTION_NODE_LABEL
            ],
            end_node_labels=["Author"],
            start_node_properties=["id"],
            end_node_properties=["_hash_id"],
        )

        relSets["PAPER_HAS_ABSTRACTHUB"] = RelationshipSet(
            rel_type="PAPER_HAS_ABSTRACT",
            start_node_labels=["Paper"],
            end_node_labels=[
                "Abstract", config.JSON2GRAPH_COLLECTION_NODE_LABEL
            ],
            start_node_properties=["paper_id"],
            end_node_properties=["id"],
        )
        relSets["ABSTRACTHUB_HAS_ABSTRACT"] = RelationshipSet(
            rel_type="ABSTRACT_HAS_ABSTRACT",
            start_node_labels=[
                "Abstract", config.JSON2GRAPH_COLLECTION_NODE_LABEL
            ],
            end_node_labels=["Abstract"],
            start_node_properties=["id"],
            end_node_properties=["_hash_id"],
        )

        # Define id nodes and relations
        for col_name, node_props in self.id_columns.items():
            nodeSets[node_props["label"]] = NodeSet(
                [self.id_node_label, node_props["label"]], ["id"])
            relSets[node_props["label"]] = RelationshipSet(
                rel_type="PAPERID_COLLECTION_HAS_PAPERID",
                start_node_labels=[
                    self.id_node_label,
                    config.JSON2GRAPH_COLLECTION_NODE_LABEL,
                ],
                end_node_labels=[self.id_node_label, node_props["label"]],
                start_node_properties=["id"],
                end_node_properties=[node_props["attr"]],
            )

        self.nodeSets = nodeSets
        self.relSets = relSets
Example #26
def read_daily_report_data_csv_JHU(file):
    """
    Extract data from a single daily report file from JHU.

    Old format (until 03-21-2020)
        Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
    New format:
        FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key

    :param file: Path to the CSV file
    :return:
    """
    log.info('Read JHU CSV file {}'.format(file))
    # the file format (old vs. new) is determined from the header length below

    countries = NodeSet(['Country'], ['name'])
    provinces = NodeSet(['Province'], ['name'])
    updates = NodeSet(['DailyReport'], ['uuid'])
    province_in_country = RelationshipSet('PART_OF', ['Province'], ['Country'],
                                          ['name'], ['name'])
    province_in_country.unique = True
    province_rep_update = RelationshipSet('REPORTED', ['Province'],
                                          ['DailyReport'], ['name'], ['uuid'])

    with open(file, 'rt') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        # skip header
        header = next(rows)
        if len(header) > 8:
            file_type = 'new'
        else:
            file_type = 'old'
        log.info("File type: {}".format(file_type))

        for row in rows:

            if file_type == 'old':
                country, province, date, confirmed, death, recovered, lat, long = parse_jhu_old_file_row(
                    row)
            elif file_type == 'new':
                country, province, date, confirmed, death, recovered, lat, long = parse_jhu_new_file_row(
                    row)

            province_dict = {'name': province}
            if lat and long:
                province_dict['latitude'] = lat
                province_dict['longitude'] = long

            uuid = country + province + str(date)

            provinces.add_unique(province_dict)

            countries.add_unique({'name': country})

            updates.add_unique({
                'date': date,
                'confirmed': confirmed,
                'death': death,
                'recovered': recovered,
                'uuid': uuid
            })

            province_in_country.add_relationship({'name': province},
                                                 {'name': country},
                                                 {'source': 'jhu'})
            province_rep_update.add_relationship({'name': province},
                                                 {'uuid': uuid},
                                                 {'source': 'jhu'})

    return countries, provinces, updates, province_in_country, province_rep_update
Example #27
graph = py2neo.Graph(host=NEO4J_HOST, user=NEO4J_USER, password=NEO4J_PASSWORD)
graph.run("MATCH (a) RETURN a LIMIT 1")

# Download file from NCBI FTP Server
print('Download file from NCBI FTP server')
with urlopen(
        'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
) as r:
    with open(DOWNLOAD_FILE_PATH, 'wb') as f:
        shutil.copyfileobj(r, f)

# define NodeSet and RelationshipSet
ncbi_gene_nodes = NodeSet(['Gene'], ['gene_id'])
ensembl_gene_nodes = NodeSet(['Gene'], ['gene_id'])
gene_mapping_rels = RelationshipSet('MAPS', ['Gene'], ['Gene'], ['gene_id'],
                                    ['gene_id'])

# iterate the data file and extract nodes/relationships
print('Iterate file and create nodes/relationships')
# collect mapped ENSEMBL gene IDs to avoid duplicate genes
ensembl_gene_ids_added = set()

with gzip.open(DOWNLOAD_FILE_PATH, 'rt') as file:
    # skip header line
    next(file)
    # iterate file
    for line in file:
        fields = line.strip().split('\t')
        ncbi_gene_id = fields[1]

        # get mapping to ENSEMBL Gene IDs
Example #28
    def _create_relation(self,
                         parent_node: Node,
                         child_node: Node,
                         relation_props={},
                         relationshipset_identifier=None):

        if parent_node is None or child_node is None:
            return None
        # labels = ":".join(parent_node.labels) + "|" + ":".join(child_node.labels)

        if relationshipset_identifier is None:
            relationshipset_identifier = (
                frozenset(parent_node.labels),
                frozenset(child_node.labels),
            )

        if (hasattr(parent_node, "override_reltype") and
                child_node.__primarylabel__ in parent_node.override_reltype):
            relationshipset_identifier = (
                frozenset(parent_node.labels),
                frozenset(child_node.labels),
                frozenset(
                    parent_node.override_reltype[child_node.__primarylabel__]),
            )

        # Create new relationshipset if necessary
        if relationshipset_identifier not in self.relationshipSets:
            rel_name = None
            if callable(self.config_func_custom_relation_name_generator):
                rel_name = self.config_func_custom_relation_name_generator(
                    parent_node, child_node, relation_props)
            if rel_name is None:
                child_node_name = child_node.__primarylabel__.upper()
                parent_node_name = parent_node.__primarylabel__.upper()
                rel_name = "{}_HAS_{}".format(
                    parent_node_name,
                    child_node_name,
                )
                if hasattr(parent_node, "override_reltype"):
                    if child_node.__primarylabel__ in parent_node.override_reltype:
                        rel_name = parent_node.override_reltype[
                            child_node.__primarylabel__].upper()

                if rel_name in self.config_dict_reltype_override:
                    rel_name = self.config_dict_reltype_override[rel_name]
            if rel_name in self.config_list_drop_reltypes:
                self._blocked_reltypes.append(relationshipset_identifier)
            else:
                self.relationshipSets[
                    relationshipset_identifier] = RelationshipSet(
                        rel_type=rel_name,
                        start_node_labels=frozenset(parent_node.labels),
                        end_node_labels=frozenset(child_node.labels),
                        start_node_properties=self._get_merge_keys(
                            parent_node),
                        end_node_properties=self._get_merge_keys(child_node),
                    )
        # add relationship to set if not blocked by caller config
        if relationshipset_identifier not in self._blocked_reltypes:
            self.relationshipSets[relationshipset_identifier].add_relationship(
                start_node_properties={
                    key: val
                    for key, val in dict(parent_node).items()
                    if key in self._get_merge_keys(parent_node)
                },
                end_node_properties={
                    key: val
                    for key, val in dict(child_node).items()
                    if key in self._get_merge_keys(child_node)
                },
                properties=relation_props,
            )
Example #29
    def __init__(self):
        super(NcbiHomoloGeneParser, self).__init__()

        # output data
        self.gene_homolog_gene = RelationshipSet('HOMOLOG', ['Gene'], ['Gene'],
                                                 ['sid'], ['sid'])
    def __init__(self):
        super(DependingTestParser, self).__init__()
        self.rels = RelationshipSet('FOO', ['Source'], ['Target'],
                                    ['source_id'], ['target_id'])