def main(fp):
    """Read member link data from the CSV at *fp* and merge website/SNS URLs
    into the matched Member records via GraphQL."""
    gql_client = GraphQLClient()
    finder = MemberFinder(search_fields=['name', 'name_hira'])
    df = pd.read_csv(fp).fillna('')
    LOGGER.info(f'load {len(df)} members from {fp}')

    def lookup(row):
        # Try each search field in order; return the first match, None if none hit.
        for field in ('name', 'name_hira'):
            try:
                return finder.find_one(row[field], exact_match=True)
            except ValueError as e:
                LOGGER.debug(e)
        return None

    members = []
    for _, row in df.iterrows():
        member = lookup(row)
        if not member:
            LOGGER.warning(f'failed to find member for row={row}')
            continue
        # Copy only the link columns that are non-empty in the CSV.
        for link_field in ('website', 'twitter', 'facebook'):
            value = row[link_field]
            if value:
                setattr(member, link_field, value)
        members.append(member)

    gql_client.bulk_merge(members)
    LOGGER.info(f'merged {len(members)} member links')
def main(fp):
    """Load Diet (国会) sessions from the CSV at *fp* and merge them via GraphQL."""
    client = GraphQLClient()

    def build_diet(row):
        # One Diet per CSV row; the id is derived from the populated fields.
        diet = Diet(None)
        diet.number = int(row['number'])
        diet.name = f'第{diet.number}回国会'
        diet.category = row['category']
        diet.start_date = to_neo4j_datetime(row['start_date'])
        diet.end_date = to_neo4j_datetime(row['end_date'])
        diet.id = idgen(diet)
        return diet

    diets = [build_diet(row) for _, row in pd.read_csv(fp).iterrows()]
    client.bulk_merge(diets)
    LOGGER.info(f'merged {len(diets)} diets')
def test_bulk_merge(self):
    """bulk_merge returns one result per object, keyed op0/op1 in input order."""
    gql = GraphQLClient()
    objects = [self._build_sample_bill(), self._build_sample_url()]
    data = gql.bulk_merge(objects)
    for index, obj in enumerate(objects):
        assert data[f'op{index}']['id'] == obj.id
class SpiderTemplate(scrapy.Spider):
    """Base class for spiders that persist scraped entities to GraphQL and Elasticsearch.

    Subclasses must set :attr:`domain` and implement :meth:`parse`.
    Helper methods wire the common relationships (Url/Activity/Minutes/Speech
    to their parent resources) through the shared GraphQL client.
    """

    domain = NotImplemented  # subclass responsibility

    def __init__(self, *args, **kwargs):
        super(SpiderTemplate, self).__init__(*args, **kwargs)
        # Silence chatty third-party loggers.
        logging.getLogger('elasticsearch').setLevel(logging.WARNING)
        logging.getLogger('sgqlc').setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        # BUG FIX: the original body was the bare expression `NotImplemented`,
        # which is a no-op statement — it silently overrode scrapy's
        # Spider.parse (which raises NotImplementedError). Restore the
        # raising contract so an un-overridden parse fails loudly.
        raise NotImplementedError

    def link_urls(self, urls):
        """
        link Url to parent resource
        """
        from_ids, to_ids = [], []
        for url in urls:
            # Urls without a to_id have no parent to link to; skip them.
            if hasattr(url, 'to_id'):
                from_ids.append(url.id)
                to_ids.append(url.to_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_activities(self, activities):
        """
        link Activity to Member, Bill, and Minutes
        """
        from_ids, to_ids = [], []
        for activity in activities:
            # An activity may carry any subset of these foreign keys.
            for id_field in ['member_id', 'bill_id', 'minutes_id']:
                if hasattr(activity, id_field):
                    from_ids.append(activity.id)
                    to_ids.append(getattr(activity, id_field))
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Committee and Member
        """
        self.link_bills_by_topics(minutes)

        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)

        if hasattr(minutes, 'speakers'):
            from_ids = []
            to_ids = []
            for speaker in minutes.speakers:
                try:
                    member = self.member_finder.find_one(speaker)
                except ValueError as e:
                    LOGGER.debug(e)  # this is expected when speaker is not member
                else:
                    from_ids.append(member.id)
                    to_ids.append(minutes.id)
            if from_ids:
                self.gql_client.bulk_link(from_ids, to_ids)

    def link_speeches(self, speeches):
        """link each Speech to its parent Minutes"""
        from_ids, to_ids = [], []
        for speech in speeches:
            from_ids.append(speech.id)
            to_ids.append(speech.minutes_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def store_urls_for_bill(self, urls, bill_query):
        """Merge *urls* and link each of them to the single Bill matching *bill_query*.

        Skips with a warning when the bill cannot be uniquely resolved.
        """
        if not urls:
            return
        try:
            bill = self.bill_finder.find_one(bill_query)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.bulk_merge(urls)
            # CONSISTENCY FIX: pass a concrete list as every other bulk_link
            # call site does, instead of a lazily-evaluated map object.
            self.gql_client.bulk_link([url.id for url in urls], [bill.id] * len(urls))

    def delete_old_urls(self, src_id, url_title):
        """delete urls attached to *src_id* whose title equals *url_title*"""
        obj = self.gql_client.get(src_id)
        for url in obj.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def link_bills_by_topics(self, minutes: Minutes):
        """link Minutes to every Bill whose name appears in minutes.topics"""
        if not hasattr(minutes, 'topics'):
            return
        from_ids, to_ids = [], []
        for topic in minutes.topics:
            try:
                bill = self.bill_finder.find_one(topic)
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when topic does not include bill
            else:
                from_ids.append(minutes.id)
                to_ids.append(bill.id)
                LOGGER.debug(f'link {minutes.id} to {bill.id}')
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)
            LOGGER.info(f'linked {len(from_ids)} bills to {minutes.id}')