Example #1
def lambda_handler(event, context):
    """Fetch the SRA runs linked to a BioSample record and queue any new
    runs for processing."""

    graph = Graph()
    g = graph.traversal().withRemote(DriverRemoteConnection(db, 'g')) ###!!!
    
    accession = event['biosample_id']
    record = xml.fromstring(
        requests.get(
            link.format(accession=accession,
                        api_key=api_key)).text)
                       
    biosample = record.find('.//SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID').text                   
    
    for item in record.findall('.//Link/Id'):
        link_id = item.text
        run_record = xml.fromstring(
            requests.get(
                fetch.format(database='sra',
                             accession=link_id,
                             api_key=api_key)).text)
        for run in run_record.findall('.//RunSet/Run'):
            sra_accession = run.attrib['accession']
            if not list(g.E().hasLabel('NAMED_IN')
                         .filter(__.properties().values('name').is_(sra_accession))):
                # this run is new; register it and queue it for processing
                uri = f's3://edb/{biosample}/runs/{sra_accession}/'
                edb_id = load_record(run, run_record, biosample, uri, g)
                sraq.send_message(MessageBody=json.dumps(
                    dict(data=dict(accession=sra_accession,
                                   biosample=event.get('biosample', ''),
                                   bioproject=event.get('bioproject', ''),
                                   s3Bucket=uri,
                                   edb_record_id=edb_id),
                         results=dict())))
            
    return ''
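
The traversal source in this handler opens a DriverRemoteConnection that is never closed (which is presumably what the ###!!! marker flags). A minimal sketch of the connect/use/close pattern with gremlinpython, assuming db holds the websocket endpoint URL:

from gremlin_python.structure.graph import Graph
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

db = 'wss://example-endpoint:8182/gremlin'  # assumed endpoint URL

connection = DriverRemoteConnection(db, 'g')
try:
    g = Graph().traversal().withRemote(connection)
    print(g.V().limit(1).count().next())
finally:
    connection.close()  # avoid leaking websocket connections between invocations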
Example #2
def lambda_handler(event, context):
    """Get Biosamples that are part of the project and add them to the BioSample
       processing queue. Then add sub-projects to the BioProject queue."""
    print(event['bioproject'])
    recursive_depth = event.get('recursive_depth', 0)
    print(f"recursive depth {recursive_depth}")
    recursive_depth += 1
    record = xml.fromstring(
        requests.get(
            req.format(database="bioproject",
                       accession=event['bioproject'],
                       api_key=api_key)).text)
    graph = Graph()
    g = graph.traversal().withRemote(DriverRemoteConnection(db, 'g'))  ###!!!
    for tag in record.findall('.//LocusTagPrefix'):
        try:
            biosample = tag.attrib['biosample_id']
        except KeyError:
            pass
        else:
            # bs_record = xml.fromstring(
            #     requests.get(req.format(database="biosample",
            #                             accession=biosample,
            #                             api_key=api_key)).text)
            # biosample = bs_record.find('''.//Id[@is_primary="1"]''')
            # bs_dict = {a.attrib['attribute_name']:a.text for a in bs_record.findall('.//Attribute')}
            if not list(g.E().hasLabel('NAMED_IN').filter(
                    __.properties().values('name').is_(biosample))):
                # add the biosample to the queue
                bsq.send_message(MessageBody=json.dumps(
                    dict(biosample=biosample, bioproject=event['bioproject'])))
                print(biosample)
    if recursive_depth < max_recursion:
        #get all of the child bioprojects and dump them in the queue
        for message in (
                dict(recursive_depth=recursive_depth,
                     bioproject=ref.attrib['accession'])
                for ref in record.findall(".//ProjectLinks/Link/ProjectIDRef")
                if ref.attrib['accession'].upper() !=
                event['bioproject'].upper()):
            bpq.send_message(MessageBody=json.dumps(message))
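
Both queues in this handler carry small JSON bodies; a minimal sketch of the two message shapes, assuming boto3 SQS queue resources wired up like the bsq and bpq globals above (queue names and accessions are placeholders):

import json
import boto3

sqs = boto3.resource('sqs')
bsq = sqs.get_queue_by_name(QueueName='biosample-queue')   # assumed name
bpq = sqs.get_queue_by_name(QueueName='bioproject-queue')  # assumed name

# BioSample message: one sample plus the project it came from.
bsq.send_message(MessageBody=json.dumps(
    dict(biosample='SAMN00000001', bioproject='PRJNA000001')))

# BioProject message: a child project carrying the incremented depth,
# so consumers stop recursing once recursive_depth reaches max_recursion.
bpq.send_message(MessageBody=json.dumps(
    dict(recursive_depth=1, bioproject='PRJNA000002')))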
Example #3
    def test_upsert_thrice(self) -> None:
        executor = mock.Mock(wraps=self.get_proxy().query_executor())

        # test that we will insert
        db_name = Fixtures.next_database()
        database_uri = f'database://{db_name}'
        vertex_type = VertexType(
            label=VertexTypes.Database.value.label,
            properties=VertexTypes.Database.value.properties +
            tuple([Property(name='foo', type=GremlinType.String)]))

        exists = self._get(label=vertex_type,
                           key=database_uri,
                           extra_traversal=__.count())
        self.assertEqual(exists, 0)
        _upsert(executor=executor,
                g=self.get_proxy().g,
                key_property_name=self.get_proxy().key_property_name,
                label=vertex_type,
                key=database_uri,
                name='test',
                foo='bar')
        exists = self._get(label=vertex_type,
                           key=database_uri,
                           extra_traversal=__.count())
        self.assertEqual(exists, 1)
        id = self._get(label=vertex_type,
                       key=database_uri,
                       extra_traversal=__.id())

        executor.reset_mock()
        _upsert(executor=executor,
                g=self.get_proxy().g,
                key_property_name=self.get_proxy().key_property_name,
                label=vertex_type,
                key=database_uri,
                name='test')
        exists = self._get(label=vertex_type,
                           key=database_uri,
                           extra_traversal=__.count())
        self.assertEqual(exists, 1)
        self.assertEqual(executor.call_count, 2)
        # first one is the get:
        self.assertEqual(executor.call_args_list[0][1]['query'].bytecode,
                         __.V(id).valueMap(True).bytecode)
        # the second one is the no-op write (nothing changed, so just id()):
        self.assertEqual(executor.call_args_list[1][1]['query'].bytecode,
                         __.V(id).id().bytecode)

        executor.reset_mock()
        _upsert(executor=executor,
                g=self.get_proxy().g,
                key_property_name=self.get_proxy().key_property_name,
                label=vertex_type,
                key=database_uri,
                name='test2',
                foo=None)
        exists = self._get(label=vertex_type,
                           key=database_uri,
                           extra_traversal=__.count())
        self.assertEqual(exists, 1)
        self.assertEqual(executor.call_count, 2)
        # first one is the get:
        self.assertEqual(executor.call_args_list[0][1]['query'].bytecode,
                         __.V(id).valueMap(True).bytecode)
        # the second one drops foo and writes the changed name:
        self.assertEqual(
            executor.call_args_list[1][1]['query'].bytecode,
            __.V(id).sideEffect(__.properties('foo').drop()).property(
                Cardinality.single, 'name', 'test2').id().bytecode)
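
The assertions above pin down the two-query shape of the upsert: a valueMap(True) read followed by the smallest write that reconciles the differences (a bare id() traversal when nothing changed, a property drop when a value is explicitly None). A sketch of that read-then-patch idea, not the actual _upsert implementation:

from gremlin_python.process.graph_traversal import __
from gremlin_python.process.traversal import Cardinality

def sketch_upsert(g, vertex_id, **desired):
    # Query 1: read the current state of the vertex.
    current = g.V(vertex_id).valueMap(True).next()
    # Query 2: build the smallest mutation that reaches the desired state.
    t = g.V(vertex_id)
    for name, value in desired.items():
        if value is None:
            t = t.sideEffect(__.properties(name).drop())
        elif current.get(name) != [value]:  # valueMap wraps values in lists
            t = t.property(Cardinality.single, name, value)
    return t.id().next()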
Example #4
def set_fields_routing_probs(graph_client: GremlinClient,
                             metrics_client: HeronMetricsClient,
                             topology_id: str, topology_ref: str,
                             start: dt.datetime, end: dt.datetime) -> None:
    """ Sets the routing probabilities for fields grouped logical connections
    in physical graph with the supplied topology ID and reference. Routing
    probabilities are calculated using metrics from the defined time window.

    Arguments:
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        metrics_client (HeronMetricsClient): The client instance for the
                                             metrics database.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology reference string.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metrics gathering window.
        end (dt.datetime):  The UTC datetime object for the end of the metrics
                            gathering window.
    """

    LOG.info(
        "Setting fields grouping routing probabilities for topology %s "
        "reference %s using metrics data from %s to %s", topology_id,
        topology_ref, start.isoformat(), end.isoformat())

    topology_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    i_to_i_rps: pd.DataFrame = calculate_inter_instance_rps(
        metrics_client, topology_id, start, end)

    # Re-index the DataFrame to make selecting RPs faster
    i_to_i_rps.set_index(["source_task", "stream", "destination_task"],
                         inplace=True)

    # Get a list of all fields grouped connections in the physical graph
    fields_connections: List[Dict[str, Union[int, str, Edge]]] = \
        (topology_traversal.V()
         .outE("logically_connected")
         .has("grouping", "FIELDS")
         .project("source_task", "stream", "edge", "destination_task")
         .by(__.outV().properties("task_id").value())
         .by(__.properties("stream").value())
         .by()
         .by(__.inV().properties("task_id").value())
         .toList())

    LOG.debug(
        "Processing %d fields grouped connections for topology %s "
        "reference %s", len(fields_connections), topology_id, topology_ref)

    connection: Dict[str, Union[int, str, Edge]]
    for connection in fields_connections:

        LOG.debug("Processing connection from instance %d to %d on stream %s",
                  connection["source_task"], connection["destination_task"],
                  connection["stream"])

        routing_prob: float = (i_to_i_rps.loc[
            connection["source_task"], connection["stream"],
            connection["destination_task"]]["routing_probability"])

        (topology_traversal.E(connection["edge"]).property(
            "routing_probability", routing_prob).next())