def shares_from_url(url, request_start_date):
    params = {
        'count': 1000,
        # 'includeHistory': 'true',
        'link': url,
        'platforms': 'facebook',
        'sortBy': 'total_interactions',
        'startDate': request_start_date,
        'token': CROWDTANGLE_API_KEY
    }
    api_endpoint = 'links'
    response = requests.get('{}/{}'.format(CROWDTANGLE_API_URL, api_endpoint), params).json()

    shares = []
    if response['status'] == 200:
        if not response.get('result'):
            logger.warning('Crowdtangle data returned is empty.')
            return shares
        for post in response['result']['posts']:
            account = post['account']
            shares.append({
                'account': {
                    'crowdtangleIdentifier': str(account['id']),
                    'facebookIdentifier': str(account['platformId']),
                    'logoUrl': account['profileImage'],
                    'name': account['name'],
                    'url': account['url']
                },
                'post': {
                    'crowdtangleIdentifier': str(post['id']),
                    'facebookIdentifier': str(post['platformId']),
                    'publishedDate': datetime.strptime(post['date'], '%Y-%m-%d %H:%M:%S'),
                    'url': post['postUrl'],
                }
            })
    else:
        logger.error(
            f'Error in fetching from Crowdtangle: {response.get("message", "Unknown exception.")}'
        )
        logger.warning('Returning empty interaction data')
        sleep(30)
    return shares
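# Hypothetical usage sketch, not part of the original module: shows how
# shares_from_url might be called for a single fact-checked URL. The example
# URL and the 7-day window are assumptions; the 'YYYY-MM-DD' start-date
# format is assumed to match what the CrowdTangle /links endpoint accepts.
from datetime import datetime, timedelta

start_date = (datetime.utcnow() - timedelta(days=7)).strftime('%Y-%m-%d')
shares = shares_from_url('https://example.com/some-claim', start_date)
for share in shares:
    print(share['account']['name'], share['post']['url'])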
def create_entity_from_row(entity_name):
    try:
        entity = entity_from_row_for(
            entity_name,
            request.json,
            request.json.get('index', request.json.get('airtableId')))
        if entity:
            ApiHandler.save(entity)
            if entity.__class__ == Appearance:
                sync_content(entity.quotingContent)
            return jsonify(as_dict(entity)), 200
        else:
            return jsonify({"error": "couldn't save the entity"}), 500
    except Exception as e:
        logger.error(e)
        return jsonify({"exception": "couldn't complete your request"}), 500
def read_thumb(files=None, form=None):
    if 'thumb' in files:
        thumb = files['thumb']
        filename_parts = thumb.filename.rsplit('.', 1)
        if len(filename_parts) < 2 \
                or filename_parts[1].lower() not in ALLOWED_EXTENSIONS:
            raise ApiErrors({
                'thumb': [
                    f"Cette image manque d'une extension {READABLE_EXTENSIONS} ou son format n'est pas autorisé"
                ]
            })
        return thumb.read()

    if 'thumbUrl' in form:
        try:
            return _fetch_image(form['thumbUrl'])
        except ValueError as e:
            logger.error(e)
            raise ApiErrors(
                {'thumbUrl': ["L'adresse saisie n'est pas valide"]})
def _fetch_image(thumb_url: str) -> bytes:
    if not thumb_url.startswith('http'):
        raise ValueError('Invalid thumb URL : %s' % thumb_url)

    try:
        response = requests.get(thumb_url)
    except Exception as e:
        logger.error(e)
        raise ApiErrors({
            'thumbUrl': ["Impossible de télécharger l'image à cette adresse"]
        })

    content_type = response.headers['Content-type']
    is_an_image = content_type.split('/')[0] == 'image'
    if response.status_code == 200 and is_an_image:
        return response.content
    else:
        raise ValueError(
            'Error downloading thumb from url %s (status_code : %s)'
            % (thumb_url, str(response.status_code)))
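# Hypothetical usage sketch, not part of the original module: read_thumb is
# meant to be called from a Flask view with the request's file and form data.
# The route path, the `app`/`request`/`jsonify` imports and the returned
# payload are assumptions, not the codebase's actual endpoint.
@app.route('/mediations', methods=['POST'])
def create_mediation():
    thumb = read_thumb(files=request.files, form=request.form)
    # `thumb` is the raw image bytes; persist it however the codebase requires
    return jsonify({'thumbSize': len(thumb)}), 201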
def create_wayback_machine_url(url, sleep_time=2):
    logger.info('Saving {} to Wayback Machine...'.format(url))
    with requests.Session() as session:
        session.headers = {
            'Connection': 'keep-alive',
            'host': urlparse(BASE_URL).hostname,
            'User-Agent': 'Science Feedback (https://sciencefeedback.co)'
        }
        # requests.Session ignores `allow_redirects` and `timeout` attributes,
        # so they must be passed on the request itself
        res = session.get('{}{}'.format(SAVE_URL, url),
                          allow_redirects=True, timeout=120)
        # wait time to ensure the page is saved
        sleep(sleep_time)
        if res.status_code == 200:
            logger.info('Saving {} to Wayback Machine...Done.'.format(url))
            location = res.headers['Content-Location']
            return '{}{}'.format(BASE_URL, location)
        else:
            logger.error('Saving {} to Wayback Machine...ERROR: {}'.format(
                url, res.status_code))
            return None
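# Hypothetical usage sketch, not part of the original module: BASE_URL and
# SAVE_URL are assumed to point at the Wayback Machine host and its save
# endpoint. Passing a longer sleep_time for slow-to-archive pages is an
# assumption about how the helper is tuned, not documented behaviour.
archived = create_wayback_machine_url('https://example.com/article', sleep_time=5)
if archived:
    logger.info('Archived at {}'.format(archived))
else:
    logger.warning('Wayback Machine did not return an archive URL')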
def claim_verdicts_from_airtable(verdicts_to_sync=None, max_verdicts=None, sync_async=False):
    if verdicts_to_sync is None:
        query = Verdict.query.filter(Verdict.scienceFeedbackUrl != None)
        if max_verdicts is not None:
            query = query.limit(max_verdicts)
        verdicts = query.all()
    else:
        verdicts = verdicts_to_sync
        if max_verdicts is not None:
            max_verdicts = len(verdicts)

    urls = [verdict.scienceFeedbackUrl for verdict in verdicts][:max_verdicts]

    if sync_async:
        claim_reviews = map_asynchronous(claim_review_from_url, urls)
    else:
        claim_reviews = [claim_review_from_url(url) for url in urls]

    for (index, verdict) in enumerate(verdicts):
        claim_review = claim_reviews[index]
        if not claim_review:
            continue
        for conclusion in claim_review['conclusions']:
            try:
                tag = Tag.create_or_modify({
                    '__SEARCH_BY__': ['label', 'type'],
                    'label': conclusion,
                    'type': TagType.CONCLUSION
                })
                if tag.id is None:
                    logger.info('Saving tag {}'.format(as_dict(tag)))
                    ApiHandler.save(tag)
                verdict_tag = VerdictTag.create_or_modify({
                    '__SEARCH_BY__': ['tagId', 'verdictId'],
                    'tagId': humanize(tag.id),
                    'verdictId': humanize(verdict.id)
                })
                verdict.verdictTags = verdict.verdictTags + [verdict_tag]
            except IntegrityError as e:
                logger.error('IntegrityError: {}, Conclusion: {}'.format(
                    e, conclusion))
            except InvalidRequestError as e:
                logger.error('InvalidRequestError: {}, Conclusion: {}'.format(
                    e, conclusion))
            except NotNullViolation as violation:
                logger.error('NotNullViolation: {}, Conclusion: {}'.format(
                    violation, conclusion))

    return verdicts
def sync_for(
        name,
        formula=None,
        max_records=None,
        session=None,
        sync_to_airtable=False
):
    if session is None:
        session = requests.Session()

    rows = request_airtable_rows(
        SCIENCE_FEEDBACK_AIRTABLE_BASE_ID,
        NAME_TO_AIRTABLE[name],
        filter_by_formula=formula,
        max_records=max_records,
        session=session
    )

    entities = []
    if rows:
        logger.info(f'syncing table {NAME_TO_AIRTABLE[name]}')
    else:
        logger.info(f'nothing to sync for table {NAME_TO_AIRTABLE[name]}')

    for (index, row) in enumerate(rows):
        try:
            entity = entity_from_row_for(name, row, index)
            if entity:
                entities.append(entity)
                row['Synced time input'] = datetime.now().isoformat()
            else:
                row['Synced time input'] = 'ERROR'
        except KeyError as exception:
            logger.warning(f'Error while trying to create entity from row at table {NAME_TO_AIRTABLE[name]}')
            logger.error(f'KeyError {exception}: {row}')
            row['Synced time input'] = 'ERROR'
        except Exception as exception:
            logger.warning(f'Error while trying to create entity from row at table {NAME_TO_AIRTABLE[name]}')
            logger.error(f'Unexpected error: {exception} - {sys.exc_info()[0]} at {row}')
            row['Synced time input'] = 'ERROR'

    def _update_10_rows_from_index(i):
        records = [{'id': row['airtableId'],
                    'fields': {'Synced time input': row['Synced time input']}}
                   for row in rows[i: i + 10]]
        res = update_airtable_rows(
            SCIENCE_FEEDBACK_AIRTABLE_BASE_ID,
            NAME_TO_AIRTABLE[name],
            {'records': records},
            session=session
        )
        if res.status_code != 200:
            logger.error(f'code: {res.status_code}, error: {res.content}')

    try:
        # Sync verdict status from wordpress
        if name == 'verdict' and formula is not None:
            entities = claim_verdicts_from_airtable(verdicts_to_sync=entities)

        # Sync related contents for appearances
        if name == 'appearance' and formula is not None:
            for entity in entities:
                sync_content(entity.quotingContent)

        # Set the time synced so that the status in airtable is "Synced"
        if sync_to_airtable:
            for i in range(0, len(rows), 10):
                try:
                    ApiHandler.save(*entities[i:i + 10])
                    _update_10_rows_from_index(i)
                except Exception as exception:
                    logger.warning(f'Error while trying to save 10 entities at table {NAME_TO_AIRTABLE[name]}')
                    logger.error(f'Unexpected error: {exception} - {sys.exc_info()[0]}')
                    # Cap the range so the final, possibly shorter batch does not
                    # index past the end of `rows`
                    for index in range(i, min(i + 10, len(rows))):
                        rows[index]['Synced time input'] = 'BATCH ERROR'
                    _update_10_rows_from_index(i)
    except Exception as exception:
        logger.warning(f'Error while trying to save entities at table {NAME_TO_AIRTABLE[name]}')
        logger.error(f'Unexpected error: {exception} - {sys.exc_info()[0]}')
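# Hypothetical usage sketch, not part of the original module: a periodic job
# might call sync_for on each table, only pulling rows whose status marks them
# as ready. The table names, the Airtable formula string and the dependency
# order are assumptions about the Airtable schema, not documented behaviour.
with requests.Session() as session:
    for table_name in ('claim', 'verdict', 'appearance'):
        sync_for(
            table_name,
            formula="{Status} = 'Ready'",  # hypothetical Airtable formula
            session=session,
            sync_to_airtable=True
        )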
def url_from_archiveis(url):
    save_url = '{}/submit/'.format(ARCHIVEIS_URL)
    headers = {
        'User-Agent': 'Science Feedback (https://sciencefeedback.co)',
        'host': urlparse(ARCHIVEIS_URL).hostname
    }

    get_kwargs = dict(allow_redirects=True, headers=headers, timeout=120)
    response = requests.get(ARCHIVEIS_URL + '/', **get_kwargs)
    response.raise_for_status()

    html = str(response.content)
    try:
        unique_id = html.split('name="submitid', 1)[1].split('value="', 1)[1].split('"', 1)[0]
    except IndexError as e:
        logger.error('Cannot find unique id: {}.'.format(e))
        logger.info('Submitting without unique id.')
        unique_id = None

    data = {
        "url": url,
        "anyway": 1,
    }
    if unique_id is not None:
        data.update({'submitid': unique_id})

    post_kwargs = dict(allow_redirects=True, headers=headers, data=data, timeout=120)
    logger.info('Archiving URL: {}'.format(url))
    response = requests.post(save_url, **post_kwargs)
    response.raise_for_status()

    if 'Refresh' in response.headers:
        archive_url = str(
            response.headers['Refresh']).split(';url=')[1].replace('/wip', '')
        logger.info("archive_url from Refresh header: {}".format(archive_url))
        return archive_url

    if 'Location' in response.headers:
        archive_url = response.headers['Location']
        logger.info("archive_url from Location header: {}".format(archive_url))
        return archive_url

    logger.info(
        "archive_url not found in response headers. Inspecting history.")
    for i, r in enumerate(response.history):
        logger.info("Inspecting history request #{}".format(i))
        logger.info(r.headers)
        if 'Location' in r.headers:
            archive_url = r.headers['Location']
            logger.info(
                "archive_url from the Location header of {} history response: {}"
                .format(i + 1, archive_url))
            return archive_url

    logger.error("No archive_url returned by archive.vn")
    logger.error("Status code: {}".format(response.status_code))
    logger.error(response.headers)
    logger.error(response.text)
    return None
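# Hypothetical usage sketch, not part of the original module: the two archiving
# helpers can be tried in turn, keeping whichever service returns an archive
# URL first. The fallback ordering is an assumption, not the original
# pipeline's behaviour.
def archive_url_for(url):
    archived = url_from_archiveis(url)
    if archived is None:
        archived = create_wayback_machine_url(url)
    return archived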