예제 #1
0
def main():
    """Bulk-link or bulk-unlink relationships listed in a CSV file.

    Reads (from_id, to_id) pairs from ``args.file`` and either removes
    (when ``args.delete`` is set) or creates the corresponding
    relationships via the GraphQL client.
    """
    client = GraphQLClient()
    pairs = pd.read_csv(args.file)
    from_ids, to_ids = pairs['from_id'], pairs['to_id']

    if args.delete:
        client.bulk_unlink(from_ids=from_ids, to_ids=to_ids)
        LOGGER.info(f'deleted {len(pairs)} relationships')
    else:
        client.bulk_link(from_ids=from_ids, to_ids=to_ids)
        LOGGER.info(f'merged {len(pairs)} relationships')
예제 #2
0
def main():
    """Link news articles to matched Minutes, Bills, and the Timeline.

    Fetches news from GraphQL for the configured date range, then for each
    article retrieves its full text from Elasticsearch and creates links
    based on text matching.  Individual steps can be disabled through the
    ``args.skip_minutes`` / ``args.skip_bill`` / ``args.skip_timeline``
    flags; a per-article summary of successes and failures is logged at
    the end.
    """
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()

    news_list = gql_client.get_all_news(
        fields=['id', 'title', 'published_at', 'is_timeline'],
        start_date=args.start_date,
        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')

    if args.check_timeline:
        news_list = [news for news in news_list if news.is_timeline]
        LOGGER.info(f'filtered {len(news_list)} timeline news')

    stats = defaultdict(int)
    for news in tqdm(news_list):
        LOGGER.info(f'process {news.id}')
        stats['process'] += 1
        try:
            news_text = es_client.get(news.id)
            if not args.skip_minutes:
                LOGGER.debug(f'check Minutes for {news.id}')
                minutes_list = fetch_matched_minutes(news, news_text)
                if minutes_list:
                    # one (news -> minutes) link per matched minutes record
                    gql_client.bulk_link(
                        [news.id] * len(minutes_list),
                        [minutes['id'] for minutes in minutes_list])
                    LOGGER.info(
                        f'linked {len(minutes_list)} minutes for {news.id}')
            if not args.skip_bill:
                LOGGER.debug(f'check Bill for {news.id}')
                bill_list = fetch_matched_bills(news, news_text)
                if bill_list:
                    gql_client.bulk_link(
                        [news.id] * len(bill_list),
                        [bill['id'] for bill in bill_list])
                    LOGGER.info(f'linked {len(bill_list)} bills for {news.id}')
            if not args.skip_timeline:
                LOGGER.debug(f'check Timeline for {news.id}')
                is_timeline = fetch_is_timeline(news, news_text)
                if is_timeline:
                    # need to create new instance to avoid neo4j datetime error
                    updated_news = News(None)
                    updated_news.id = news.id
                    updated_news.is_timeline = is_timeline
                    gql_client.merge(updated_news)
                    LOGGER.info(f'linked {news.id} to timeline')
        except json.decoder.JSONDecodeError:
            # narrow except first: a malformed API response is expected
            # occasionally and only warrants a warning, not a stack trace
            stats['fail'] += 1
            LOGGER.warning(f'failed to parse API response for {news.id}')
        except Exception:
            stats['fail'] += 1
            LOGGER.exception(f'failed to process {news.id}')
    LOGGER.info('processed {} news ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
예제 #3
0
    def test_bulk_link(self):
        """Round-trip a bulk link then unlink between one Url and two resources."""
        client = GraphQLClient()

        url = self._build_sample_url()
        bill = self._build_sample_bill()
        minutes = self._build_sample_minutes()

        from_ids = [url.id] * 2
        to_ids = [bill.id, minutes.id]
        expected_ops = [(url.id, bill.id), (url.id, minutes.id)]

        # linking should report both operations and attach the url to each target
        data = client.bulk_link(from_ids, to_ids)
        for i, (from_id, to_id) in enumerate(expected_ops):
            assert data[f'op{i}']['from']['id'] == from_id
            assert data[f'op{i}']['to']['id'] == to_id
        for target_id in (bill.id, minutes.id):
            assert url.id in [u.id for u in client.get(target_id).urls]

        # unlinking should report the same operations and detach the url
        data = client.bulk_unlink(from_ids, to_ids)
        for i, (from_id, to_id) in enumerate(expected_ops):
            assert data[f'op{i}']['from']['id'] == from_id
            assert data[f'op{i}']['to']['id'] == to_id
        for target_id in (bill.id, minutes.id):
            assert url.id not in [u.id for u in client.get(target_id).urls]
예제 #4
0
def main():
    """Create one Timeline node per day and link that day's events to it.

    Events are bills, minutes, and timeline-flagged news whose date fields
    fall on the given day; days range over [args.start_date, args.end_date).
    """
    gql_client = GraphQLClient()
    bill_list = gql_client.get_all_bills(['id'] + BILL_DATE_FIELDS)
    LOGGER.info(f'fetched {len(bill_list)} bills')
    minutes_list = gql_client.get_all_minutes(['id'] + MINUTES_DATE_FIELD)
    LOGGER.info(f'fetched {len(minutes_list)} minutes')
    news_list = gql_client.get_all_news(['id', 'is_timeline'] +
                                        NEWS_DATE_FIELD,
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news')
    date2bill = build_date_dict(bill_list, BILL_DATE_FIELDS)
    date2minutes = build_date_dict(minutes_list, MINUTES_DATE_FIELD)
    date2news = build_date_dict(news_list, NEWS_DATE_FIELD)

    # note: end_date itself is excluded (half-open range)
    num_days = (args.end_date - args.start_date).days
    dates = [args.start_date + timedelta(offset) for offset in range(num_days)]
    for date in tqdm(dates):
        timeline = Timeline(None)
        timeline.date = _Neo4jDateTimeInput(year=date.year,
                                            month=date.month,
                                            day=date.day)
        timeline.id = idgen(timeline)
        gql_client.merge(timeline)

        from_ids = [bill.id for bill in date2bill[date]]
        from_ids += [minutes.id for minutes in date2minutes[date]]
        from_ids += [news.id for news in date2news[date] if news.is_timeline]
        gql_client.bulk_link(from_ids, [timeline.id] * len(from_ids))
        LOGGER.info(f'linked {len(from_ids)} events to {date}')
예제 #5
0
class SpiderTemplate(scrapy.Spider):
    """Base spider that wires up backend clients and common link helpers.

    Subclasses set ``domain`` and implement :meth:`parse`; the ``link_*``
    methods create relationships between already-stored resources via the
    GraphQL client.
    """

    domain = NotImplemented

    def __init__(self, *args, **kwargs):
        super(SpiderTemplate, self).__init__(*args, **kwargs)
        # silence noisy third-party loggers
        logging.getLogger('elasticsearch').setLevel(logging.WARNING)
        logging.getLogger('sgqlc').setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        # intentionally a no-op placeholder; concrete spiders override this
        NotImplemented

    def _bulk_link(self, from_ids, to_ids):
        """Issue a single bulk_link call, skipping the no-op empty case."""
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_urls(self, urls):
        """
        link Url to parent resource
        """

        from_ids, to_ids = [], []
        for url in urls:
            if hasattr(url, 'to_id'):
                from_ids.append(url.id)
                to_ids.append(url.to_id)
        self._bulk_link(from_ids, to_ids)

    def link_activities(self, activities):
        """
        link Activity to Member, Bill, and Minutes
        """

        from_ids, to_ids = [], []
        for activity in activities:
            for id_field in ['member_id', 'bill_id', 'minutes_id']:
                if hasattr(activity, id_field):
                    from_ids.append(activity.id)
                    to_ids.append(getattr(activity, id_field))
        self._bulk_link(from_ids, to_ids)

    def link_bill_action(self, bill_action_lst):
        """
        link BillAction to Bill, Minutes, and Speech
        """

        from_ids, to_ids = [], []
        for bill_action in bill_action_lst:
            for id_field in ['bill_id', 'minutes_id', 'speech_id']:
                if hasattr(bill_action, id_field):
                    from_ids.append(bill_action.id)
                    to_ids.append(getattr(bill_action, id_field))
        self._bulk_link(from_ids, to_ids)

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Member, and Committee
        """

        if hasattr(minutes, 'topic_ids'):
            # topic_ids may contain '' placeholders for unmatched topics
            bill_ids = [topic_id for topic_id in minutes.topic_ids if topic_id]
            if bill_ids:
                self.gql_client.bulk_link([minutes.id] * len(bill_ids), bill_ids)
                LOGGER.info(f'linked {len(bill_ids)} bills to {minutes.id}')

        if hasattr(minutes, 'speaker_ids'):
            # speaker_ids may contain '' placeholders for non-member speakers
            member_ids = [speaker_id for speaker_id in minutes.speaker_ids if speaker_id]
            if member_ids:
                self.gql_client.bulk_link(member_ids, [minutes.id] * len(member_ids))
                LOGGER.info(f'linked {len(member_ids)} members to {minutes.id}')

        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)

    def link_speeches(self, speeches):
        """Link each Speech to its Minutes, and the speaking Member to the Speech."""
        from_ids, to_ids = [], []
        for speech in speeches:
            from_ids.append(speech.id)
            to_ids.append(speech.minutes_id)
            if hasattr(speech, 'member_id'):
                from_ids.append(speech.member_id)
                to_ids.append(speech.id)
        self._bulk_link(from_ids, to_ids)

    def delete_old_urls(self, src_id, url_title):
        """Delete every Url on ``src_id`` whose title equals ``url_title``."""
        obj = self.gql_client.get(src_id, fields=['urls'])
        for url in obj.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def get_diet(self, diet_number=None):
        """Return the Diet for ``diet_number``, or the latest one when omitted."""
        if diet_number:
            return self.gql_client.get(f'Diet:{diet_number}', ['id', 'number', 'start_date'])
        else:
            return self.get_latest_diet()

    def get_latest_diet(self):
        """Return the Diet with the highest number."""
        diets = sorted(self.gql_client.get_all_diets(['id', 'number', 'start_date']), key=lambda x: x.number)
        return diets[-1]

    def get_topic_ids(self, topics):
        """Map each topic string to a Bill id, or '' when no bill matches."""
        def get_topic_id(topic):
            maybe_bill_number = extract_bill_number_or_none(topic)
            maybe_category = extract_bill_category_or_none(topic)
            try:
                # prefer the most specific lookup available for the topic
                if maybe_bill_number:
                    bill = self.bill_finder.find_one(maybe_bill_number)
                elif maybe_category:
                    bill = self.bill_finder.find_one(topic, category=maybe_category)
                else:
                    bill = self.bill_finder.find_one(topic)
                return bill.id
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when topic does not include bill
            return ''

        return [get_topic_id(topic) for topic in topics]

    def get_speakers_ids(self, speakers):
        """Map each speaker name to a Member id, or '' when not a member."""
        def get_speaker_id(speaker):
            try:
                member = self.member_finder.find_one(speaker)
                return member.id
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when speaker is not member
            return ''

        return [get_speaker_id(speaker) for speaker in speakers]
예제 #6
0
class SpiderTemplate(scrapy.Spider):
    """Base spider that sets up backend clients and shared link helpers."""

    domain = NotImplemented

    def __init__(self, *args, **kwargs):
        super(SpiderTemplate, self).__init__(*args, **kwargs)
        # quiet down chatty third-party loggers
        for noisy_logger in ('elasticsearch', 'sgqlc'):
            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        # placeholder; concrete spiders implement their own parse
        NotImplemented

    def link_urls(self, urls):
        """Link each Url that knows its parent (``to_id``) to that parent."""
        linkable = [url for url in urls if hasattr(url, 'to_id')]
        if linkable:
            self.gql_client.bulk_link([url.id for url in linkable],
                                      [url.to_id for url in linkable])

    def link_activities(self, activities):
        """Link each Activity to its Member, Bill, and Minutes where present."""
        pairs = []
        for activity in activities:
            for id_field in ('member_id', 'bill_id', 'minutes_id'):
                if hasattr(activity, id_field):
                    pairs.append((activity.id, getattr(activity, id_field)))
        if pairs:
            from_ids, to_ids = zip(*pairs)
            self.gql_client.bulk_link(list(from_ids), list(to_ids))

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Committee and Member
        """

        self.link_bills_by_topics(minutes)

        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)

        if hasattr(minutes, 'speakers'):
            member_ids = []
            for speaker in minutes.speakers:
                try:
                    member = self.member_finder.find_one(speaker)
                except ValueError as e:
                    LOGGER.debug(e)  # this is expected when speaker is not member
                else:
                    member_ids.append(member.id)
            if member_ids:
                self.gql_client.bulk_link(member_ids,
                                          [minutes.id] * len(member_ids))

    def link_speeches(self, speeches):
        """Link each Speech to its parent Minutes."""
        pairs = [(speech.id, speech.minutes_id) for speech in speeches]
        if pairs:
            from_ids, to_ids = zip(*pairs)
            self.gql_client.bulk_link(list(from_ids), list(to_ids))

    def store_urls_for_bill(self, urls, bill_query):
        """Persist ``urls`` and attach them to the Bill matching ``bill_query``."""
        if not urls:
            return
        try:
            bill = self.bill_finder.find_one(bill_query)
        except ValueError as e:
            LOGGER.warning(e)
            return
        self.gql_client.bulk_merge(urls)
        self.gql_client.bulk_link((url.id for url in urls),
                                  [bill.id] * len(urls))

    def delete_old_urls(self, src_id, url_title):
        """Delete every Url on ``src_id`` whose title equals ``url_title``."""
        parent = self.gql_client.get(src_id)
        for url in parent.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def link_bills_by_topics(self, minutes: Minutes):
        """Link ``minutes`` to every Bill that matches one of its topics."""
        if not hasattr(minutes, 'topics'):
            return

        to_ids = []
        for topic in minutes.topics:
            try:
                bill = self.bill_finder.find_one(topic)
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when topic does not include bill
            else:
                to_ids.append(bill.id)
                LOGGER.debug(f'link {minutes.id} to {bill.id}')
        if to_ids:
            from_ids = [minutes.id] * len(to_ids)
            self.gql_client.bulk_link(from_ids, to_ids)
            LOGGER.info(f'linked {len(from_ids)} bills to {minutes.id}')