Example No. 1
def collect_cards_internal(requester, board, board_members, checklists, lists, card_status):
    collected_cards = []
    last_card_id = None
    while True:
        filters = {'filter': 'all', 'fields': 'all', 'limit': '1000'}
        if last_card_id:
            # The Trello API supports paging: pass the id of the last card in the previous batch as the 'before' parameter
            filters['before'] = last_card_id
        cards = board.get_cards(filters=filters, card_filter=card_status)
        for card in cards:
            db_card, created = Document.objects.get_or_create(
                trello_board_id=board.id,
                trello_card_id=card.id,
                requester=requester,
                user_id=requester.id
            )
            card_last_activity = card.raw.get('dateLastActivity')
            last_activity = parse_dt(card_last_activity).isoformat()
            last_activity_ts = int(parse_dt(card_last_activity).timestamp())
            collected_cards.append(card)
            if not created and db_card.last_updated_ts and db_card.last_updated_ts >= last_activity_ts:
                logger.debug("Trello card '%s' for user '%s' hasn't changed", card.name[:50], requester.username)
                continue
            logger.debug("Processing card '%s' for user '%s'", card.name[:50], requester.username)
            db_card.primary_keywords = TRELLO_PRIMARY_KEYWORDS
            db_card.secondary_keywords = TRELLO_SECONDARY_KEYWORDS['card']
            db_card.last_updated = last_activity
            db_card.last_updated_ts = last_activity_ts
            db_card.trello_title = 'Card: {}'.format(card.name)
            db_card.webview_link = card.url
            db_card.trello_content = {
                'description': _to_html(card.description),
                'checklists': [
                    {
                        'id': cl.id,
                        'name': cl.name,
                        'items': cl.items
                    }
                    for cl in checklists[card.id]
                ]
            }
            db_card.trello_card_status = 'Archived' if card.closed else 'Open'
            db_card.trello_card_members = [board_members.get(m) for m in card.idMembers if m in board_members]
            db_card.trello_board_name = board.name
            db_card.trello_list = lists.get(card.idList)
            db_card.last_synced = get_utc_timestamp()
            db_card.download_status = Document.READY
            db_card.save()
            algolia_engine.sync(db_card, add=created)
            last_card_id = card.id
        if len(cards) < 1000:
            break
    return collected_cards
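
The 'before' parameter drives the pagination above. A minimal standalone sketch of that pattern, assuming a py-trello board object is already available (the iter_cards helper name is hypothetical):

def iter_cards(board, card_status='all', page_size=1000):
    # Page backwards through the board: each request asks only for cards
    # older than the last card id returned by the previous batch.
    last_card_id = None
    while True:
        filters = {'filter': 'all', 'fields': 'all', 'limit': str(page_size)}
        if last_card_id:
            filters['before'] = last_card_id
        cards = board.get_cards(filters=filters, card_filter=card_status)
        for card in cards:
            yield card
        if len(cards) < page_size:
            break
        last_card_id = cards[-1].id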
Example No. 2
def collect_files(requester, repo_id, repo_name, repo_url, default_branch,
                  enrichment_delay):
    """
    List all files in a repo - should be called once, after the first sync of a repo. Subsequent syncing is handled
    via the collect_commits() function.

    Note that this uses Github's API call for retrieval of recursive trees:
      https://developer.github.com/v3/git/trees/#get-a-tree-recursively
    This API call returns a flat list of all files and saves us many API calls that would otherwise be needed
    to recursively fetch files for each repo directory. But it may not work well for very big repos
    (> 5k files), because the Github API limits the number of elements it returns in a single call.
    """
    github_client = init_github_client(requester)
    repo = github_client.get_repo(full_name_or_id=repo_name)
    new_files = []
    for f in repo.get_git_tree(sha=repo.default_branch, recursive=True).tree:
        db_file, created = Document.objects.get_or_create(
            github_file_id=_compute_sha('{}{}'.format(repo_id, f.path)),
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if created:
            new_files.append({
                'sha': f.sha,
                'filename': f.path,
                'action': 'modified',
                'type': f.type
            })
            db_file.primary_keywords = GITHUB_PRIMARY_KEYWORDS
            db_file.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['file']
            # set the timestamp to 0 (epoch) to signal that we don't know the update timestamp
            db_file.last_updated_ts = 0
            db_file.last_updated = datetime.utcfromtimestamp(
                0).isoformat() + 'Z'
            db_file.github_title = '{}: {}'.format(
                'Dir' if f.type == 'tree' else 'File',
                f.path.split('/')[-1])
            db_file.github_file_path = f.path
            db_file.github_repo_full_name = repo_name
            db_file.webview_link = '{}/blob/{}/{}'.format(
                repo_url, default_branch, f.path)
            algolia_engine.sync(db_file, add=created)
        db_file.last_synced = get_utc_timestamp()
        db_file.download_status = Document.PENDING
        db_file.save()
    # run enrich_files() for all new_files in chunks of 50 items
    i = 0
    for ff in [new_files[x:x + 50] for x in range(0, len(new_files), 50)]:
        i = i + 1
        subtask(enrich_files).apply_async(
            args=[requester, ff, repo.id, repo_name, repo_url, default_branch],
            countdown=enrichment_delay + (240 * i))
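
The final loop above splits the newly discovered files into batches of 50 and staggers one enrichment task per batch. A minimal sketch of that chunk-and-stagger pattern, assuming the same Celery subtask(enrich_files) primitive from the example; chunked is a hypothetical helper:

def chunked(items, size=50):
    # Yield consecutive slices of at most 'size' items.
    for start in range(0, len(items), size):
        yield items[start:start + size]

# usage sketch: one task per batch, spaced 4 minutes apart
# for i, batch in enumerate(chunked(new_files), start=1):
#     subtask(enrich_files).apply_async(
#         args=[requester, batch, repo.id, repo_name, repo_url, default_branch],
#         countdown=enrichment_delay + 240 * i)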
Example No. 3
def collect_deals(requester):
    pipe_client = init_pipedrive_client(requester)
    stages = {s.id: s.name for s in pipe_client.Stage.fetch_all()}
    users = {u.id: u for u in pipe_client.User.fetch_all()}
    # fallback domain - reused from the previous deal when the current deal has no organization
    org_domain = None

    for deal in pipe_client.Deal.fetch_all():
        if deal.org_id:
            org_domain = deal.org_id.get('cc_email', '').split('@')[0]
        if not org_domain:
            # cannot associate a deal to a company
            logger.debug("Deal '%s' for user '%s' cannot be associated to a company", deal.title, requester.username)
            continue
        db_deal, created = Document.objects.get_or_create(
            pipedrive_deal_id=deal.id,
            requester=requester,
            user_id=requester.id
        )
        if not created and db_deal.last_updated_ts:
            # compare timestamps and skip the deal if it hasn't been updated
            if db_deal.last_updated_ts >= parse_dt(deal.update_time).timestamp():
                logger.debug("Deal '%s' for user '%s' hasn't changed", deal.title, requester.username)
                continue

        db_deal.primary_keywords = PIPEDRIVE_KEYWORDS['primary']
        db_deal.secondary_keywords = PIPEDRIVE_KEYWORDS['secondary']
        db_deal.pipedrive_title = deal.title
        logger.debug("Processing deal '%s' for user '%s'", deal.title, requester.username)
        db_deal.pipedrive_deal_company = deal.org_id.get('name') if deal.org_id else None
        db_deal.pipedrive_deal_value = deal.value
        db_deal.pipedrive_deal_currency = deal.currency
        db_deal.pipedrive_deal_status = deal.status
        db_deal.pipedrive_deal_stage = stages.get(deal.stage_id)
        db_deal.webview_link = 'https://{}.pipedrive.com/deal/{}'.format(org_domain, deal.id)
        db_deal.last_updated = parse_dt(deal.update_time).isoformat() + 'Z'
        db_deal.last_updated_ts = parse_dt(deal.update_time).timestamp()
        db_deal.pipedrive_content = build_deal_content(deal, users, org_domain, pipe_client)
        db_deal.last_synced = get_utc_timestamp()
        db_deal.download_status = Document.READY
        db_deal.save()
        algolia_engine.sync(db_deal, add=created)
        # add sleep of one second to avoid breaking API rate limits
        time.sleep(1)
Example No. 4
def download_gdrive_document(doc, access_token, refresh_token):
    doc.download_status = Document.PROCESSING
    doc.save()

    try:
        service = connect_to_gdrive(access_token, refresh_token)

        request = None
        if doc.mime_type.startswith('application/vnd.google-apps.'):
            export_mime = 'text/csv' if 'spreadsheet' in doc.mime_type else 'text/plain'
            request = service.files().export_media(fileId=doc.document_id, mimeType=export_mime)
        else:
            request = service.files().get_media(fileId=doc.document_id)
        response = request.execute()
        logger.info("Done downloading {} [{}]".format(doc.title, doc.document_id))

        content = cut_utf_string(response.decode('UTF-8', errors='replace'), 9000, step=10)
        doc.content = content
        doc.last_synced = get_utc_timestamp()
        algolia_engine.sync(doc, add=False)
    finally:
        doc.download_status = Document.READY
        doc.save()
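
The branch above distinguishes Google-native documents (which must be exported) from regular binaries (which can be fetched directly). A minimal sketch of that decision, assuming an authenticated Drive service built with google-api-python-client; build_download_request is a hypothetical helper:

def build_download_request(service, file_id, mime_type):
    # Google-native files (Docs, Sheets, ...) have no binary content and must
    # be exported to a concrete format; everything else is downloaded as-is.
    if mime_type.startswith('application/vnd.google-apps.'):
        export_mime = 'text/csv' if 'spreadsheet' in mime_type else 'text/plain'
        return service.files().export_media(fileId=file_id, mimeType=export_mime)
    return service.files().get_media(fileId=file_id)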
Example No. 5
def process_gdrive_docs(requester, access_token, refresh_token, files_fn, json_key):
    service = connect_to_gdrive(access_token, refresh_token)
    folders = {}

    page_token = None
    new_start_page_token = None
    while True:
        files = files_fn(service, page_token)
        new_start_page_token = files.get('newStartPageToken', new_start_page_token)
        items = files.get(json_key, [])
        if not folders and len(items) > 0:
            # retrieve all folders to be able to get file path more easily in the file listing(s)
            logger.debug("Getting folders for %s/%s", requester.id, requester.username)
            folders = get_gdrive_folders(service)
            # check if any folder was marked as hidden and we already have it synced ...
            # if we do, then remove it (plus all children) from our indexing
            for folder_id, folder in folders.items():
                if folder.get('hidden') is True:
                    desync_folder(folder.get('id'), folders, requester, service)

        for item in items:
            if 'file' in item:
                item = item['file']
            # check for ignored mime types
            if any(x.match(item.get('mimeType', '')) for x in IGNORED_MIMES):
                continue
            parents = item.get('parents', [])
            hidden = is_hidden(item.get('description')) or any(is_hidden_in_folder(f, folders) for f in parents)
            if item.get('trashed') or hidden:
                # file was removed or hidden
                Document.objects.filter(
                    document_id=item['id'],
                    requester=requester,
                    user_id=requester.id
                ).delete()
                continue

            # handle file path within gdrive
            parent = parents[0] if parents else None
            path = get_gdrive_path(parent, folders)

            doc, created = get_or_create(
                model=Document,
                document_id=item['id'],
                requester=requester,
                user_id=requester.id
            )
            doc.mime_type = item.get('mimeType').lower()
            doc.title = item.get('name')
            doc.webview_link = item.get('webViewLink')
            doc.icon_link = item.get('iconLink')
            doc.thumbnail_link = item.get('thumbnailLink')
            doc.last_updated = item.get('modifiedTime')
            doc.path = path
            last_modified_on_server = parse_date(doc.last_updated)
            doc.last_updated_ts = last_modified_on_server.timestamp()
            doc.modifier_display_name = item.get('lastModifyingUser', {}).get('displayName')
            doc.modifier_photo_link = item.get('lastModifyingUser', {}).get('photoLink')
            doc.owner_display_name = item['owners'][0]['displayName']
            doc.owner_photo_link = item.get('owners', [{}])[0].get('photoLink')
            doc.primary_keywords = GDRIVE_KEYWORDS['primary']
            doc.secondary_keywords = GDRIVE_KEYWORDS['secondary'][doc.mime_type] \
                if doc.mime_type in GDRIVE_KEYWORDS['secondary'] else None
            can_download = item.get('capabilities', {}).get('canDownload', True)
            if can_download:
                # check also the mime type as we only support some of them
                if not any(x for x in EXPORTABLE_MIMES if doc.mime_type.startswith(x)):
                    can_download = False
            if can_download:
                if not created:
                    if doc.download_status == Document.READY and \
                            (doc.last_synced is None or last_modified_on_server > doc.last_synced):
                        doc.download_status = Document.PENDING
                        subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
                else:
                    algolia_engine.sync(doc, add=created)
                    subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
            else:
                doc.download_status = Document.READY
                doc.last_synced = get_utc_timestamp()
                doc.save()
                algolia_engine.sync(doc, add=False)

            doc.save()

        page_token = files.get('nextPageToken')
        if not page_token:
            break
    return new_start_page_token
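
The outer loop above follows Drive's nextPageToken until the listing is exhausted and remembers newStartPageToken for the next incremental sync. A minimal sketch of that paging shape, assuming the same files_fn(service, page_token) callable; collect_pages is a hypothetical helper:

def collect_pages(files_fn, service, json_key='files'):
    # Walk every page; return all items plus the token to resume from next time.
    items, page_token, start_token = [], None, None
    while True:
        page = files_fn(service, page_token)
        start_token = page.get('newStartPageToken', start_token)
        items.extend(page.get(json_key, []))
        page_token = page.get('nextPageToken')
        if not page_token:
            return items, start_token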
Example No. 6
def collect_issues(requester, sync_update=False):
    jira = init_jira_client(requester)

    for project in jira.projects():
        project_name = project.raw.get('name')
        project_key = project.raw.get('key')
        project_url = '{}/projects/{}'.format(project._options.get('server'),
                                              project_key)
        logger.debug("Processing Jira project %s for user %s", project_key,
                     requester.username)

        jql = 'project={}'.format(project_key)
        if sync_update:
            # only fetch those issues that were updated in the last day
            jql = "{} and updated > '-1d'".format(jql)
        jql = '{} order by key'.format(jql)
        i = 0
        old_i = -1
        while True:
            # manually page through results (using 'maxResults=None' should page automatically, but it doesn't work)
            if i == old_i:
                break
            old_i = i
            for issue in jira.search_issues(jql,
                                            startAt=i,
                                            maxResults=25,
                                            validate_query=False):
                i = i + 1
                db_issue, created = Document.objects.get_or_create(
                    jira_issue_key=issue.key,
                    requester=requester,
                    user_id=requester.id)
                logger.debug("Processing Jira issue %s for user %s", issue.key,
                             requester.username)
                updated = issue.fields.updated or issue.fields.created or get_utc_timestamp()
                updated_ts = parse_dt(updated).timestamp()
                if not created and db_issue.last_updated_ts:
                    # compare timestamps and skip the issue if it hasn't been updated
                    if db_issue.last_updated_ts >= updated_ts:
                        logger.debug("Issue '%s' for user '%s' hasn't changed",
                                     issue.key, requester.username)
                        continue
                db_issue.primary_keywords = JIRA_KEYWORDS['primary']
                db_issue.secondary_keywords = JIRA_KEYWORDS['secondary']
                db_issue.last_updated = updated
                db_issue.last_updated_ts = updated_ts
                db_issue.webview_link = '{}/browse/{}'.format(
                    project._options.get('server'), issue.key)
                db_issue.jira_issue_title = '{}: {}'.format(
                    issue.key, issue.fields.summary)
                db_issue.jira_issue_status = issue.fields.status.name
                db_issue.jira_issue_type = issue.fields.issuetype.name
                db_issue.jira_issue_priority = issue.fields.priority.name
                if issue.fields.description:
                    db_issue.jira_issue_description = cut_utf_string(
                        issue.fields.description, 9000, 100)
                db_issue.jira_issue_duedate = issue.fields.duedate
                db_issue.jira_issue_labels = issue.fields.labels
                db_issue.jira_issue_assignee = {
                    'name': issue.fields.assignee.displayName,
                    'avatar': issue.fields.assignee.raw.get('avatarUrls', {})
                } if issue.fields.assignee else {}
                reporter = issue.fields.reporter or issue.fields.creator
                db_issue.jira_issue_reporter = {
                    'name': reporter.displayName,
                    'avatar': reporter.raw.get('avatarUrls', {})
                }
                db_issue.jira_project_name = project_name
                db_issue.jira_project_key = project_key
                db_issue.jira_project_link = project_url
                db_issue.last_synced = get_utc_timestamp()
                db_issue.download_status = Document.READY
                db_issue.save()
                algolia_engine.sync(db_issue, add=created)
            time.sleep(2)

        # add sleep of five seconds to avoid breaking API rate limits
        time.sleep(5)
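
Because automatic paging via maxResults=None is unreliable here, the loop above pages manually with startAt. A minimal sketch of that pattern, assuming a python-jira client named jira; iter_issues is a hypothetical helper:

def iter_issues(jira, jql, page_size=25):
    # Fetch fixed-size pages until a page comes back empty.
    start = 0
    while True:
        batch = jira.search_issues(jql, startAt=start, maxResults=page_size,
                                   validate_query=False)
        if not batch:
            break
        for issue in batch:
            yield issue
        start += len(batch)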
Example No. 7
def collect_customers(requester, update):
    helpscout_client = init_helpscout_client(requester)
    if not helpscout_client:
        logger.warning("User '%s' is missing Helpscout API key", requester.username)
        return
    # cache all mailboxes and their folders
    mailboxes = {m.id: m.name for m in helpscout_client.mailboxes()}
    folders = {}
    for box in mailboxes:
        helpscout_client.clearstate()
        folders[box] = {}
        while True:
            box_folders = helpscout_client.folders(box)
            if not box_folders or box_folders.count < 1:
                break
            # accumulate pages instead of overwriting with the last one
            folders[box].update({f.id: f.name for f in box_folders})
    # cache users
    users = {}
    while True:
        helpscout_users = helpscout_client.users()
        if not helpscout_users or helpscout_users.count < 1:
            break
        for u in helpscout_users:
            users[u.id] = {
                'id': u.id,
                'name': u.fullname,
                'email': u.email,
                'avatar': u.photourl
            }

    since_iso = None
    if update:
        customer_ids = set()
        # check for new stuff since last 6 hours only
        since = get_utc_timestamp() - timedelta(hours=6)
        # this abomination is needed, because Helpscout API chokes on iso dates with milliseconds and/or timezones
        since_iso = since.isoformat().split('.')[0] + 'Z'
        for box in mailboxes:
            helpscout_client.clearstate()
            while True:
                cons = helpscout_client.conversations_for_mailbox(
                    mailbox_id=box, modifiedSince=since_iso)
                if not cons or cons.count < 1:
                    break
                for con in cons:
                    customer_ids.add(con.customer.get('id'))
        for cid in customer_ids:
            # process customer
            customer = helpscout_client.customer(customer_id=cid)
            _process_customer(requester, customer, mailboxes, folders, users)
            # add sleep to avoid breaking API rate limits
            time.sleep(2)

    while True:
        customers = helpscout_client.customers(modifiedSince=since_iso) \
            if update else helpscout_client.customers()
        if not customers or customers.count < 1:
            break
        for customer in customers:
            _process_customer(requester, customer, mailboxes, folders, users)
            # add sleep to avoid breaking API rate limits
            time.sleep(2)
Example No. 8
def process_customer(requester, db_customer, mailboxes, folders, users):
    helpscout_client = init_helpscout_client(requester)
    db_customer.download_status = Document.PROCESSING
    db_customer.save()

    last_conversation = {}
    conversation_emails = set()
    conversations = []
    for box_id, box_name in mailboxes.items():
        logger.debug(
            "Fetching Helpscout conversations for '%s' in mailbox '%s'",
            db_customer.helpscout_name, box_name)
        while True:
            box_conversations = helpscout_client.conversations_for_customer_by_mailbox(
                box_id, db_customer.helpscout_customer_id)
            if not box_conversations or box_conversations.count < 1:
                break
            for bc in box_conversations:
                conversation = {
                    'id': bc.id,
                    'number': '#{}'.format(bc.number),
                    'mailbox': box_name,
                    'mailbox_id': box_id,
                    'folder': folders.get(bc.folderid),
                    'status': bc.status,
                    'owner': format_person(bc.owner),
                    'customer': format_person(bc.customer),
                    'subject': bc.subject,
                    'tags': bc.tags
                }
                last_updated = next(
                    (getattr(bc, x)
                     for x in ['usermodifiedat', 'modifiedat', 'createdat']
                     if hasattr(bc, x)), None)
                conversation['last_updated'] = last_updated
                if last_updated:
                    conversation['last_updated_ts'] = parse_dt(
                        last_updated).timestamp()
                conversations.append(conversation)
                if bc.customer:
                    conversation_emails = conversation_emails.union(
                        bc.customer.get('emails') or [])
                if last_updated and \
                        conversation.get('last_updated_ts', 0) > last_conversation.get('last_updated_ts', 0):
                    last_conversation = conversation
        # add sleep of three seconds to avoid breaking API rate limits
        time.sleep(3)
        helpscout_client.clearstate()

    if (db_customer.last_updated_ts or 0) >= last_conversation.get(
            'last_updated_ts', 0):
        logger.info(
            "Helpscout customer '%s' for user '%s' seems unchanged, skipping further processing",
            db_customer.helpscout_name, requester.username)
        db_customer.download_status = Document.READY
        db_customer.save()
        return

    db_customer.last_updated = last_conversation.get('last_updated')
    db_customer.last_updated_ts = last_conversation.get('last_updated_ts')
    db_customer.helpscout_mailbox = last_conversation.get('mailbox')
    db_customer.helpscout_mailbox_id = last_conversation.get('mailbox_id')
    db_customer.helpscout_folder = last_conversation.get('folder')
    db_customer.helpscout_status = last_conversation.get('status')
    db_customer.helpscout_assigned = last_conversation.get('owner') is not None
    if conversation_emails:
        db_customer.helpscout_emails = ', '.join(
            filter(None, conversation_emails))

    # build helpscout content
    content = process_conversations(users, conversations, helpscout_client)
    db_customer.helpscout_content = content
    db_customer.download_status = Document.READY
    db_customer.last_synced = get_utc_timestamp()
    db_customer.save()
    algolia_engine.sync(db_customer, add=False)
Example No. 9
def collect_boards(requester):
    trello_client = init_trello_client(requester)
    orgs = dict()

    for board in trello_client.list_boards(board_filter='open,closed'):
        db_board, created = Document.objects.get_or_create(
            trello_board_id=board.id,
            trello_card_id__isnull=True,
            requester=requester,
            user_id=requester.id
        )
        board_last_activity = board.raw.get('dateLastActivity')
        if not board_last_activity:
            # this nasty hack is needed, because some Trello boards don't have a 'dateLastActivity' timestamp
            # -> looks like it's those boards that have been inactive for some time
            if not created:
                board_last_activity = db_board.last_updated.isoformat()
            else:
                # Trello was established in 2011, so we use 01.01.2011 as epoch
                actions = board.fetch_actions(action_filter='all', action_limit=1, since='2011-01-01T00:00:00.000Z')
                if actions:
                    board_last_activity = actions[0].get('date')

        last_activity = parse_dt(board_last_activity).isoformat()
        last_activity_ts = int(parse_dt(board_last_activity).timestamp())
        if not created and db_board.download_status == Document.READY and \
                (db_board.last_updated_ts and db_board.last_updated_ts >= last_activity_ts):
            logger.debug("Trello board '%s' for user '%s' hasn't changed", board.name[:50], requester.username)
            continue
        logger.debug("Processing board '%s' for user '%s'", board.name[:50], requester.username)
        db_board.primary_keywords = TRELLO_PRIMARY_KEYWORDS
        db_board.secondary_keywords = TRELLO_SECONDARY_KEYWORDS['board']
        db_board.last_updated = last_activity
        db_board.last_updated_ts = last_activity_ts
        db_board.trello_title = 'Board: {}'.format(board.name)
        db_board.webview_link = board.url
        db_board._trello_description = board.description
        db_board.trello_board_status = 'Closed' if board.closed else 'Open'

        orgId = board.raw.get('idOrganization')
        if orgId and orgId not in orgs:
            try:
                org = trello_client.get_organization(orgId).raw
                orgs[orgId] = {
                    'name': org.get('displayName'),
                    'logo': 'https://trello-logos.s3.amazonaws.com/{}/30.png'.format(orgId),
                    'url': org.get('url')
                }
            except ResourceUnavailable:
                # defunct/deleted organization, assume that board is personal
                orgId = None
        db_board.trello_board_org = orgs[orgId] if orgId else None

        build_list = lambda l: {
            'id': l.id,
            'name': l.name,
            'closed': l.closed,
            'pos': l.pos
        }
        all_lists = {l.id: build_list(l) for l in board.all_lists()}
        db_board.trello_content = {
            'description': _to_html(board.description),
            'lists': sorted(
                filter(lambda x: not x.get('closed'), all_lists.values()),
                key=itemgetter('pos')
            )
        }

        build_member = lambda m: {
            'name': m.full_name,
            'url': m.url,
            'avatar': 'https://trello-avatars.s3.amazonaws.com/{}/30.png'.format(m.avatar_hash)
        }
        all_members = {m.id: build_member(m) for m in board.all_members()}
        db_board.trello_board_members = list(all_members.values())

        db_board.last_synced = get_utc_timestamp()
        db_board.download_status = Document.READY
        db_board.save()
        algolia_engine.sync(db_board, add=created)
        subtask(collect_cards).delay(requester, db_board, board.name, all_members, all_lists)
        # add sleep of 30s to avoid breaking api limits
        time.sleep(30)
Example No. 10
def collect_repos(requester):
    github_client = init_github_client(requester)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug(
            "Skipping github repos sync for user '%s' due to rate limits",
            requester.username)
        return

    i = 0
    for repo in github_client.get_user().get_repos():
        if not (repo.id or repo.full_name):
            logger.debug("Skipping github repo '%s' for user '%s'",
                         repo.full_name, requester.username)
            # seems like broken data, skip it
            continue
        if repo.fork:
            # don't process forked repos
            logger.debug("Skipping forked github repo '%s' for user '%s'",
                         repo.full_name, requester.username)
            continue

        db_repo, created = Document.objects.get_or_create(
            github_repo_id=repo.id,
            github_commit_id__isnull=True,
            github_file_id__isnull=True,
            github_issue_id__isnull=True,
            requester=requester,
            user_id=requester.id)
        db_repo.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_repo.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['repo']
        db_repo.github_title = 'Repo: {}'.format(repo.name)
        db_repo.github_repo_owner = repo.owner.login
        db_repo.github_repo_description = repo.description
        logger.debug("Processing github repo '%s' for user '%s'",
                     repo.full_name, requester.username)
        commit_count = 0
        contributors = []
        try:
            # fetch contributors
            for cnt in repo.get_contributors():
                commit_count = commit_count + cnt.contributions
                if len(contributors) <= 10:
                    contributors.append({
                        'name': cnt.name,
                        'url': cnt.html_url,
                        'avatar': cnt.avatar_url
                    })
        except UnknownObjectException:
            # most probably, this repo is disabled
            if created:
                logger.debug("Removing github repo '%s' for user '%s'",
                             repo.full_name, requester.username)
                db_repo.delete()
            continue
        db_repo.github_repo_commit_count = commit_count
        db_repo.github_repo_contributors = contributors
        db_repo.github_repo_full_name = repo.full_name
        new_timestamp = max(repo.updated_at, repo.pushed_at)
        if created or new_timestamp.timestamp() > (db_repo.last_updated_ts
                                                   or 0):
            i = i + 1
            db_repo.last_updated_ts = new_timestamp.timestamp()
            db_repo.last_updated = new_timestamp.isoformat() + 'Z'
            db_repo.webview_link = repo.html_url
            # fetch readme file
            try:
                readme = repo.get_readme()
                readme_content = cut_utf_string(readme.decoded_content.decode(
                    'UTF-8', errors='replace'),
                                                9000,
                                                step=100)
                md = github_client.render_markdown(text=readme_content).decode(
                    'UTF-8', errors='replace')
                # also replace <em> tags, because they are used by Algolia highlighting
                db_repo.github_repo_content = md.replace('<em>',
                                                         '<b>').replace(
                                                             '</em>', '</b>')
                db_repo.github_repo_readme = readme.name
            except UnknownObjectException:
                # readme does not exist
                db_repo.github_repo_content = None
            algolia_engine.sync(db_repo, add=created)
            if created:
                # sync files
                subtask(collect_files).delay(requester,
                                             repo.id,
                                             repo.full_name,
                                             repo.html_url,
                                             repo.default_branch,
                                             enrichment_delay=i * 300)
        # sync commits
        subtask(collect_commits).apply_async(args=[
            requester, repo.id, repo.full_name, repo.html_url,
            repo.default_branch, commit_count
        ],
                                             countdown=240 *
                                             i if created else 1)
        # sync issues
        subtask(collect_issues).apply_async(
            args=[requester, repo.id, repo.full_name, created],
            countdown=180 * i if created else 1)

        db_repo.last_synced = get_utc_timestamp()
        db_repo.download_status = Document.READY
        db_repo.save()
Example No. 11
def collect_commits(requester, repo_id, repo_name, repo_url, default_branch,
                    commit_count):
    """
    Sync repository commits - up to the last commit that we've already synced or
    max 200 recent commits (whichever comes first).
    This is possible because the Github API returns commits
    sorted by commit timestamp and because old commits don't change
    (at least they should not in a normally run repository).
    """
    max_commits = 200
    was_synced = Document.objects.filter(
        user_id=requester.id,
        github_repo_id=repo_id,
        github_commit_id__isnull=False).count() >= min(commit_count,
                                                       max_commits)
    github_client = init_github_client(requester,
                                       per_page=20 if was_synced else 100)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug(
            "Skipping github commits sync for user '%s' due to rate limits",
            requester.username)
        return

    i = 0
    for cmt in github_client.get_repo(full_name_or_id=repo_name).get_commits():
        if i >= max_commits:
            break
        i = i + 1
        db_commit, created = get_or_create(model=Document,
                                           github_commit_id=cmt.sha,
                                           github_repo_id=repo_id,
                                           requester=requester,
                                           user_id=requester.id)
        if not created and was_synced:
            logger.debug(
                "Found already synced commit, skipping further commits syncing for user '%s' and repo '%s'",
                requester.username, repo_name)
            break
        logger.debug(
            "Processing github commit for user '%s' and repo '%s' with message: %s",
            requester.username, repo_name, cmt.commit.message[:30])
        db_commit.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_commit.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['commit']
        db_commit.last_updated_ts = cmt.commit.committer.date.timestamp()
        db_commit.last_updated = cmt.commit.committer.date.isoformat() + 'Z'
        db_commit.webview_link = cmt.html_url
        db_commit.github_title = 'Commit: {}'.format(cmt.commit.message[:50])
        db_commit.github_commit_content = cmt.commit.message
        db_commit.github_repo_full_name = repo_name
        db_commit.github_commit_committer = {
            'name': cmt.commit.author.name,
        }
        if cmt.author:
            db_commit.github_commit_committer['url'] = cmt.author.html_url
            db_commit.github_commit_committer['avatar'] = cmt.author.avatar_url
        # get the changed/added/deleted files in this commit (up to 100 files)
        files = []
        for f in cmt.files:
            files.append({
                'sha': f.sha,
                'filename': f.filename,
                'url': f.blob_url,
                'additions': f.additions,
                'deletions': f.deletions,
                'action': f.status
            })
            if len(files) >= 100:
                break
        if was_synced and len(files) > 0:
            subtask(enrich_files).delay(requester, files, repo_id, repo_name,
                                        repo_url, default_branch)

        db_commit.github_commit_files = files
        algolia_engine.sync(db_commit, add=created)

        db_commit.last_synced = get_utc_timestamp()
        db_commit.download_status = Document.READY
        db_commit.save()
        # add sleep of half a second to avoid breaking API rate limits
        time.sleep(0.5)
Example No. 12
def enrich_files(requester, files, repo_id, repo_name, repo_url,
                 default_branch):
    """
    Fetch committers, update timestamp, etc. for files.
    """
    github_client = init_github_client(requester, per_page=50)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        # reschedule after 10 minutes
        logger.debug(
            "Skipping github enrich files for user '%s' due to rate limits",
            requester.username)
        subtask(enrich_files).apply_async(args=[
            requester, files, repo_id, repo_name, repo_url, default_branch
        ],
                                          countdown=600)
        return

    repo = github_client.get_repo(full_name_or_id=repo_name)
    for f in files:
        db_file, created = Document.objects.get_or_create(
            github_file_id=_compute_sha('{}{}'.format(repo_id,
                                                      f.get('filename'))),
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if f.get('action') == 'removed':
            db_file.delete()
            continue

        logger.debug("Enriching github file '%s' for repo '%s' and user '%s'",
                     f.get('filename'), repo_name, requester.username)
        db_file.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_file.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['file']
        db_file.github_title = '{}: {}'.format(
            'Dir' if f.get('type') == 'tree' else 'File',
            f.get('filename').split('/')[-1])
        db_file.github_file_path = f.get('filename')
        db_file.github_repo_full_name = repo_name
        db_file.webview_link = '{}/blob/{}/{}'.format(repo_url, default_branch,
                                                      f.get('filename'))
        committers = []
        seen = set()
        ts_set = False
        for cmt in repo.get_commits(sha=default_branch,
                                    path=f.get('filename')):
            if not ts_set:
                db_file.last_updated_ts = cmt.commit.committer.date.timestamp()
                db_file.last_updated = cmt.commit.committer.date.isoformat() + 'Z'
                ts_set = True
            if cmt.commit.committer.name not in seen:
                c = {'name': cmt.commit.committer.name}
                if cmt.committer:
                    c['url'] = cmt.committer.html_url
                    c['avatar'] = cmt.committer.avatar_url
                committers.append(c)
                seen.add(cmt.commit.committer.name)
            if len(committers) >= 10:
                break
        db_file.github_file_committers = committers
        algolia_engine.sync(db_file, add=created)

        db_file.last_synced = get_utc_timestamp()
        db_file.download_status = Document.READY
        db_file.save()
        # add sleep to avoid breaking API rate limits
        time.sleep(2)
Example No. 13
def collect_issues(requester, repo_id, repo_name, created):
    """
    Fetch the issues for a 'repo_name'.
    Note that the Github API treats Pull Requests as issues. Therefore, when iterating through
    a repo's issues, we get pull requests as well. At the moment, we also treat PRs as issues.
    TODO: handle pull requests properly (changed files, commits in this PR, possibly diffs ...)
    """
    github_client = init_github_client(requester)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug(
            "Skipping github issues sync for user '%s' due to rate limits",
            requester.username)
        return

    repo = github_client.get_repo(full_name_or_id=repo_name)
    search_args = {'state': 'all', 'sort': 'updated'}
    if not created:
        # if we are processing already synced repo, then just look for newly updated issues
        search_args['since'] = datetime.now(timezone.utc) - timedelta(hours=6)

    i = 0
    for issue in repo.get_issues(**search_args):
        db_issue, created = Document.objects.get_or_create(
            github_issue_id=issue.id,
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if not created and db_issue.last_updated_ts and \
                db_issue.last_updated_ts >= issue.updated_at.timestamp():
            continue
        logger.debug("Processing github issue #%s for user '%s' and repo '%s'",
                     issue.number, requester.username, repo_name)
        db_issue.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_issue.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['issue']
        db_issue.last_updated_ts = issue.updated_at.timestamp()
        db_issue.last_updated = issue.updated_at.isoformat() + 'Z'
        db_issue.webview_link = issue.html_url
        db_issue.github_title = '#{}: {}'.format(issue.number, issue.title)
        if '/pull/' in issue.html_url:
            # pull request
            db_issue.github_title = 'PR {}'.format(db_issue.github_title)
        comments = []
        if issue.comments > 0:
            for comment in issue.get_comments():
                comments.append({
                    'body': _to_html(comment.body),
                    'timestamp': comment.updated_at.timestamp(),
                    'author': {
                        'name': comment.user.login,
                        'avatar': comment.user.avatar_url,
                        'url': comment.user.html_url
                    }
                })
                # only list up to 20 comments
                if len(comments) >= 20:
                    break

        content = {'body': _to_html(issue.body), 'comments': comments}
        # take care of Algolia 10k limit
        while len(json.dumps(content).encode('UTF-8')) > 9000:
            if len(content['comments']) < 1:
                content['body'] = cut_utf_string(content['body'],
                                                 9000,
                                                 step=100)
                break
            content['comments'] = content['comments'][:-1]

        db_issue.github_issue_content = content
        db_issue.github_repo_full_name = repo_name
        db_issue.github_issue_state = issue.state
        db_issue.github_issue_labels = [x.name for x in issue.labels]
        db_issue.github_issue_reporter = {
            'name': issue.user.login,
            'avatar': issue.user.avatar_url,
            'url': issue.user.html_url
        }
        db_issue.github_issue_assignees = []
        for assignee in issue.assignees:
            db_issue.github_issue_assignees.append({
                'name': assignee.login,
                'avatar': assignee.avatar_url,
                'url': assignee.html_url
            })

        algolia_engine.sync(db_issue, add=created)
        db_issue.last_synced = get_utc_timestamp()
        db_issue.download_status = Document.READY
        db_issue.save()
        # add sleep every 50 issues to avoid breaking API rate limits
        i = i + 1
        if i % 50 == 0:
            time.sleep(20)
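
The while loop above keeps the serialized record under Algolia's record size limit by dropping comments from the end of the list first and cutting the body only as a last resort. A minimal sketch of that trimming strategy, assuming a content dict with 'body' and 'comments' keys and the same cut_utf_string helper used throughout these examples; shrink_to_limit is a hypothetical name:

import json

def shrink_to_limit(content, limit=9000):
    # Drop the last comment until the JSON payload fits; once no comments
    # remain, cut the body itself and stop.
    while len(json.dumps(content).encode('UTF-8')) > limit:
        if content['comments']:
            content['comments'] = content['comments'][:-1]
        else:
            content['body'] = cut_utf_string(content['body'], limit, step=100)
            break
    return content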