def scrape_ads_to_bucket(self, destination_bucket_dir, ad_list_url, count=None, min_word_count=None, max_word_count=None, date=None):
        # Ensure things are typed correctly
        count = int(count) if count else None
        min_word_count = int(min_word_count) if min_word_count else None
        max_word_count = int(max_word_count) if max_word_count else None
        if isinstance(date, str):
            date = convert_to_date(date)

        # TODO : Add abstraction via generator function to allow counts > one page of ads

        # Grab the HTML source code
        result_page = requests.get(ad_list_url)
        result_soup = BeautifulSoup(result_page.text, 'html.parser')

        # Scrape all ad URLs from ad list page
        ad_urls = []
        for ad_element in result_soup.find_all('li', {'class':'result-row'}):

            # Check date to allow for filtering
            datetime_string = ad_element.find('time')['datetime']
            datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%d %H:%M")
            url = ad_element.find('a')['href']

            # If date is filtered, skip any non-matching dates
            if date and datetime_obj.date() != date.date():
                continue

            # Check if it's "nearby" meaning not in the location we want
            nearby_element = ad_element.find('span', {'class': 'nearby'})
            if nearby_element:
                continue

            # Note the URL
            ad_urls.append(url)

        # Scrape the filtered URLs
        successful_uploads = 0
        bucket_paths = []
        for url in ad_urls:
            try:
                bucket_path = self.scrape_ad_to_bucket(
                    destination_bucket_dir=destination_bucket_dir,
                    ad_url=url,
                    min_word_count=min_word_count,
                    max_word_count=max_word_count
                )
                bucket_paths.append(bucket_path)
                successful_uploads += 1

                if count and successful_uploads >= count:
                    break

            except Exception as e:
                print(e)

        return bucket_paths
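Note: every example in this collection calls a project-local convert_to_date helper that none of the snippets define, and each project ships its own version. A rough, hypothetical sketch only (the 'M-D-YYYY' string format and the millisecond heuristic are assumptions, not taken from any of these repos):

from datetime import datetime

def convert_to_date(value):
    # Hypothetical sketch: each project above defines its own convert_to_date.
    # Accept either an 'M-D-YYYY' string (as the scraper passes) or a unix
    # timestamp (as the record inserters pass), and return a datetime.
    if isinstance(value, str):
        month, day, year = (int(part) for part in value.split('-'))
        return datetime(year, month, day)
    ts = float(value)
    if ts > 1e12:  # heuristic: treat very large values as milliseconds
        ts /= 1000.0
    return datetime.fromtimestamp(ts)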
Example #2
def insert_gapfill(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''insert gapfill into records'''
    status = True
    msg = ""
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        msg = "[insert_gapfill] {} is corrupted. JSON Decode Error.".format(abs_filename)
        logger.critical(msg)    
        return (False, msg, 0)
    nb_values = 0
    records = []
    db = connect()
    student = db.students.find_one({"student":student_id})
    try:
        group = student["group"]
    except KeyError:
        group = None
    for chap, v in dataset.items():
        if len(v["records"]) > 0:
            for record in v['records']:
                # chapter = get_lesson_from_stimulus(record["target"])
                record.update({
                        "classroom": classroom_id,
                        "student": student_id,
                        "group": group,
                        "word_nb": int(chap),
                        # "chapter": None,
                        "chapter": int(chap),
                        "subject":"letters",
                        "tag": record["target"],
                        "word": record["target"],
                        "value": record["target"],
                        "dataset": _dataset["dataset"],
                        "lesson": None
                        })
                record["day"] = convert_to_date(record["unixTime"]) 
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["game"] = str(record["minigameId"]) 
                record["isClicked"] = int(record["isClicked"] == True)
                del record["minigameId"]
                del record["targetId"]
                record["elapsedTime"] = float(record["elapsedTime"])
                records.append(record)
                nb_values +=1
    if nb_values == 0:
        return (False, "[insert_gapfill] No records found for {}".format(student_id), 0)
    try:
        db.records.insert_many(records)
    except Exception as e:
        status = False
        msg = "[insert_gapfill] failed to insert records for student {}. Error: {}".format(student_id, e)
    return (status, msg, nb_values)
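Note: insert_gapfill above, and insert_gp / insert_nb below, all load their input through a read_json helper that is not shown. Judging from how the result is iterated (a dict keyed by chapter/number, each value holding a 'records' list), a minimal sketch, with the encoding as an assumption:

import json

def read_json(abs_filename):
    # Hypothetical sketch of the loader these inserters rely on: the whole
    # file is one JSON document, a dict whose values each carry a 'records' list.
    with open(abs_filename, 'r', encoding='utf-8') as f:
        return json.load(f)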
Example #3
File: app.py Project: kacperwnuk/Notepad
    def put(self):
        data = parser.parse_args()
        try:
            db_note = Note.query.filter_by(title=data['title']).first()
            if db_note is None:
                return abort(404, message=f"No note found with title {data['title']}!")
            db_note.description = data['description']
            db_note.is_markdown = data['isMarkdownFile']
            date = convert_to_date(data['date'])
            db_note.date = date if date != '' else db_note.date
            db_note.categories = self._get_categories(data['categories'])

            db.session.commit()
            return db_note.json()
        except (ValueError, DatabaseError) as e:
            db.session.rollback()
            return abort(400, message=f"Your request contains wrong data! {e}")
Example #4
File: app.py Project: kacperwnuk/Notepad
    def post(self):
        data = parser.parse_args()
        try:
            date = convert_to_date(data['date'])
            if date != '':
                note = Note(title=data['title'],
                            description=data['description'],
                            is_markdown=data['isMarkdownFile'],
                            date=date)
            else:
                note = Note(title=data['title'],
                            description=data['description'],
                            is_markdown=data['isMarkdownFile'])

            note.categories = self._get_categories(data['categories'])

            note.save()
            return note.json()
        except (ValueError, DatabaseError) as e:
            db.session.rollback()
            return abort(400, message=f"Your request contains wrong data! {e}")
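Note: a hypothetical client-side call against these Notepad handlers; the host, port, and route are assumptions, and the payload keys simply mirror what parser.parse_args() reads above:

import requests

# Hypothetical URL: the real route depends on how the resource is registered in app.py.
payload = {
    'title': 'Shopping list',
    'description': '- milk\n- eggs',
    'isMarkdownFile': True,
    'date': '2020-01-15',  # whatever format the project's convert_to_date expects
    'categories': ['home'],
}
response = requests.post('http://localhost:5000/notes', json=payload)
print(response.status_code, response.json())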
Example #5
    def __init__(self, page: int = 1, page_size: int = 5, category: str = "", start_date: str = "", end_date: str = ""):
        self.page = page
        self.page_size = page_size
        self.category = category
        self.start_date = convert_to_date(start_date)
        self.end_date = convert_to_date(end_date)
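Note: only __init__ is shown, so the enclosing class name is unknown; a hypothetical instantiation with a made-up name:

# QueryFilter is a placeholder name for the class this __init__ belongs to.
filters = QueryFilter(page=2, page_size=10, category='news',
                      start_date='2020-01-01', end_date='2020-01-31')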
Example #6
def poem_stitcher(cities=None,
                  urls=None,
                  dont_post_if_runtime_under=None,
                  min_length=None,
                  max_length=None,
                  date=None,
                  image_flavor=None,
                  all_of_day=False,
                  no_youtube_upload=False,
                  voice=None,
                  speaking_rate=None,
                  pitch=None,
                  upload_to_bucket_path=None):
    # Toss invalid combinations of args
    if all_of_day and (not date or not cities):
        raise BadOptionsError(
            'Must specify DATE and CITIES when using ALL_OF_DAY flag')

    # Ensure correct typing
    dont_post_if_runtime_under = float(
        dont_post_if_runtime_under) if dont_post_if_runtime_under else None
    date = convert_to_date(date) if isinstance(date, str) else date

    # TODO Hash args to create the dir
    destination_bucket_dir = f'{cities[0].replace(" ", "").lower()}-{str(date.date())}'

    print(cities, date)

    ##############
    # SCRAPE ADS #
    ##############

    # Form request list for scrapers
    scraper_request_list = []

    # NOTE: Handled cases: (all-of-day, date, city)
    if all_of_day:
        # Scrape each city for ads from DATE
        for city in cities:
            scraper_request_list.append({
                'method': 'POST',
                'url': CRAIGSLIST_SCRAPER_ENDPOINT,
                'json': {
                    'destination_bucket_dir': destination_bucket_dir,
                    'city': city.replace(' ', '').lower(),
                    'date': f'{date.month}-{date.day}-{date.year}',
                    # TODO: Attach min word count and such here
                }
            })

    print(scraper_request_list)

    # Send requests from list concurrently
    responses = handle_requests(scraper_request_list)

    print(responses)

    # Capture all scraped ad bucket paths
    ad_bucket_paths = []
    for response in responses:
        # Filter out TimeoutError that we can catch from workers who responded too slowly
        try:
            # each body is the repr of a list of bucket paths; parse it with
            # ast.literal_eval rather than eval (requires `import ast`)
            ad_bucket_paths += ast.literal_eval(response.decode('utf-8'))
        except Exception:
            print(response)

    if not ad_bucket_paths:
        raise ValueError('No ads collected')

    ##################
    # GENERATE POEMS #
    ##################

    # Request poem for each scraped ad's blob
    maker_request_list = []

    if all_of_day:
        for ad_bucket_path in ad_bucket_paths:
            maker_request_list.append({
                'method': 'POST',
                'url': POEM_MAKER_ENDPOINT,
                'json': {
                    'bucket_path': ad_bucket_path,
                    'destination_bucket_dir': destination_bucket_dir,
                    'image_flavor': image_flavor,
                    'voice': voice,
                    'pitch': pitch,
                    'speaking_rate': speaking_rate
                }
            })
    else:
        print('Not yet handling cases other than --all-of-day. Exiting...')
        exit()

    responses = handle_requests(maker_request_list)
    print(responses)

    # Capture all videos bucket paths
    video_bucket_paths = []
    for response in responses:
        # Filter out TimeoutError that we can catch from workers who responded too slowly
        try:
            response_string = response.decode('utf-8')
            if response_string and 'Exception' not in response_string and 'Rate' not in response_string:
                video_bucket_paths.append(response_string)
        except Exception:
            print(response)

    print(video_bucket_paths)

    if not video_bucket_paths:
        raise ValueError('No poems successfully made')

    # Grab the blobs
    video_blobs = [
        get_blob('craig-the-poet', bucket_path)
        for bucket_path in video_bucket_paths
    ]

    #########
    # ORDER #
    #########

    # Order the blobs by time of post
    def to_datetime(b):
        # e.g. 2019-12-25T21:27:07-0600; compare by local wall-clock time
        datetime_string = b.metadata['ad-posted-time']
        date_string, time_string = datetime_string.split('T')
        time_string = time_string.split('-')[0]  # strip the UTC offset
        return datetime.strptime(f'{date_string} {time_string}', '%Y-%m-%d %H:%M:%S')

    video_blobs = sorted(video_blobs, key=to_datetime)

    ############
    # VALIDATE #
    ############

    # Check current sum of poem run time
    total_runtime = sum(
        float(blob.metadata['runtime']) for blob in video_blobs)

    if dont_post_if_runtime_under and total_runtime < dont_post_if_runtime_under:
        raise Exception('Minimum runtime length not met. Exiting...')

    ################
    # CONCAT POEMS #
    ################

    # Download all poems that we've selected
    makedir('poems')
    local_poem_filepaths = []
    for i, blob in enumerate(video_blobs):
        local_poem_filepath = f'poems/poem-{i}.mp4'
        blob.download_to_filename(local_poem_filepath)
        local_poem_filepaths += [local_poem_filepath]

    # Concatenate all the poems into one file
    concat_videos(local_poem_filepaths, 'out.mp4', **FFMPEG_CONFIG)

    print('Concatenation complete')

    if upload_to_bucket_path:
        upload_file_to_bucket('craig-the-poet', 'out.mp4',
                              upload_to_bucket_path)


    #    if no_upload:
    #        return './out.mp4'

    #####################
    # UPLOAD TO YOUTUBE #
    #####################

    # Get information for description
    video_info_list = []
    current_runtime = 0.
    for blob in video_blobs:
        runtime = float(blob.metadata['runtime'])
        video_info_list.append({
            'title': blob.metadata['ad-title'],
            'start-time': current_runtime
        })
        current_runtime += runtime

    title = f'Missed Connections {cities[0]} {date.month}-{date.day}-{date.year}'

    def float_to_youtube_time(s):
        minutes = int(s) // 60
        seconds = int(s) % 60
        return f'{minutes}:{seconds:02d}'

    description = 'My name is Craigothy, and I am a poet. These works are all inspired by true events.\n'
    for info in video_info_list:
        description += f'\n{float_to_youtube_time(info["start-time"])} -- {info["title"]}'
    description += '\n\nAll rights reserved for pleasure.'

    keywords = 'love,craigslist,poetry,humanity,craig'

    args = {
        # Default args for google-api-client namespace objects
        'auth_host_name': 'localhost',
        'auth_host_port': [8080, 8090],
        'category': '22',
        'logging_level': 'ERROR',
        'noauth_local_webserver': True,  # Default was False
        'privacyStatus': 'public',
        'file': 'out.mp4',
        'title': title,
        'description': description,
        'keywords': keywords
    }

    print(args)
    if no_youtube_upload:
        return './out.mp4'

    print('Uploading to YouTube...')
    upload_youtube_video(args)

    return 'Success'
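Note: poem_stitcher fans its scraper and poem-maker requests out through a handle_requests helper that is not shown. A plausible sketch with requests plus a thread pool; the signature and the return type (a list of raw response bodies as bytes, matching how the responses are decoded above) are inferred, not confirmed:

import requests
from concurrent.futures import ThreadPoolExecutor

def handle_requests(request_list, timeout=540):
    # Hypothetical sketch: issue every request concurrently and return
    # the raw bodies as bytes, the shape poem_stitcher decodes above.
    def send(req):
        try:
            resp = requests.request(req['method'], req['url'],
                                    json=req.get('json'), timeout=timeout)
            return resp.content
        except requests.RequestException as e:
            return str(e).encode('utf-8')

    with ThreadPoolExecutor(max_workers=8) as pool:
        return list(pool.map(send, request_list))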
Example #7
def insert_gp(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''
    insert GP records  into Records
    '''
    db = connect()
    status = True
    msg = ""
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        status = False
        msg = "{} is corrupted. End file missing. Skipping insertion".format(abs_filename)
        # logger.warning(msg)
        return (status, msg, 0)

    games = set()
    student = db.students.find_one({"student":student_id})
    try:
        group = student["group"]
    except KeyError:
        status = False
        msg = '[insert_gp] student {} has no group'.format(student_id)
        # logger.warning(msg)
        group = None
    
    values = 0
    records = []
    for key, v in dataset.items():    
        if len(v["records"]) > 0:
            step = db.path.find_one({"visualaudio":key})
            # send to report and call Cassandra
            if step is None:
                msg = "No chapter or lesson found in db.path with visualaudio=`{}`".format(key)
                if not "." in key:
                    step = db.path.find_one({"visual":key.split("-")[0]})
                    if step is None:
                        status = False
                        msg = "No chapter or lesson found in db.path with visual=`{}`".format(key.split("-")[0])
                        # logger.warning('[insert_gp] '+msg)
                        #insert_report(["warning", "insert_gp", abs_filename, "db.records", False, msg, len(v["records"]) ])
                        step = {"chapter":-1, "lesson":-1, "tag": key, "CV": ""}
                # syllabs handling: choose to affect it to first syllab ... :()
                else:
                    # print(key)
                    step = {"chapter":-1, "lesson":-1, "tag": key, "CV": "NA"}
                    # first_syllabs = key.split(".")
                    # step = db.path.find_one({"visualaudio":first_syllabs[0]})       
            game_ids = list(set([n["minigameId"] for n in v["records"]]))
            games.update(game_ids)
            for record in v["records"]:
                try:
                    record["stimulus_tag"] =  GP_TAGS[record["stimulus"]]
                    # print(record["stimulus_tag"])
                except KeyError:
                    status = False
                    msg = '[insert_gp] No tag in path found for stimulus_tag {}'.format(record["stimulus"])
                    # logger.warning(')
                    record["stimulus_tag"] = None            
                record["classroom"] = classroom_id
                record["student"] = student_id
                record["group"] = group
                record["chapter"] = int(step["chapter"])
                record["lesson"] = int(step["lesson"])
                record["tag"] = step["tag"]
                record["target_tag"] = step["tag"]
                record["value"] = key
                record["dataset"] ="gp"
                record["subject"] = "letters"
                record["CV"] = step["CV"]
                record["day"] = convert_to_date(record["unixTime"])
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["isClicked"] = int(record["isClicked"] == True)
                record["game"] = str(record["minigameId"])
                del record["minigameId"]
                values += 1
                records.append(record)
    
    if values == 0:
        return (False, "[insert_gp] has no records for student {}".format(student_id), 0)
    try:
        db.records.insert_many(records)
    except Exception as e:
        status = False
        msg = '[insert_gp] failed to insert {} records for student {}. Error: {}'.format(values, student_id, e)
    return (status, msg, values)
Example #8
def insert_assessments(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''
    insert assessments
    '''
    msg = ""
    records = []
    nb_values = 0
    db = connect()
    status = True
    # rank = db.path.find_one({"subject": _dataset["subject"]}, sort=[("chapter", -1)])
    student = db.students.find_one({"student":student_id})
    if student is None:
        msg = "[insert_assessments] student not found {}".format(student_id)
        return (False, msg, 0)
    try:
        group = student["group"]
    except KeyError:
        group = None
 
    with open(abs_filename, "r") as f:
        try:
            for row in f.readlines():
                line = json.loads(row)
                if len(line["records"]) > 0:
                    for record in line["records"]:
                        record.update(
                            {k: v for k, v in line.items() if k != "records"})
                        record["tag"] = record["value"]
                        record["chapter"] = int(record["chapterId"])
                        record["lesson"] = None
                        del record["chapterId"]
                        record.update({
                            "classroom": classroom_id,
                            "student": student_id,
                            "group": group,
                            "dataset": _dataset["dataset"],
                            "subject": _dataset["subject"],
                            "game" : "fish"

                            })
                        if _dataset["subject"] == "letters":
                            record["word"] = record["value"]
                        record["day"] = convert_to_date(record["unixTime"])
                        record["unixTime"] = convert_to_isodate(record["unixTime"])
                        record["assessmentEndTime"] = convert_to_isodate(
                                    record["assessmentEndTime"])
                        records.append(record)
                        nb_values += 1
            if nb_values == 0:
                status = False
                msg = "[insert_assessments] No records found for {}".format(abs_filename)
            else:
                db.records.insert_many(records)
        except json.decoder.JSONDecodeError:
            msg = "{} is corrupted. JSON DECODE Error".format(abs_filename)
            # logger.warning("[insert_assessement]"+ msg)
            # insert_report(["error", "insert_assessments", abs_filename, "db.records", False, msg, "" ])
            return (False, msg, 0)
    return (status, msg, nb_values)
Example #9
def insert_nb(_dataset, classroom_id, student_id, abs_filename, update=False, debug=False):
    '''
    insert numbers records into Records
    '''
    try:
        dataset = read_json(abs_filename)
    except json.decoder.JSONDecodeError:
        msg = "[insert_nb()] File {} is corrupted. JSON DECODE ERROR. Skipping insertion".format(abs_filename)
        # logger.warning(msg)
        return (False, msg, 0)
    db = connect()
    status = True
    msg = ""
    values = 0
    student = db.students.find_one({"student":student_id})
    try:
        group = student["group"]
    except KeyError:
        group = None

    nb_records = []
    for nb,v in dataset.items():
        records = flatten_records(v)
        if len(records) > 0:
            try:
                step = db.path.find_one({"visual":int(nb)})
            except ValueError:
                step = None
                
            # send to report and call Cassandra
            if step is None:
                msg = "[insert_nb] No chapter or lesson found in db.path with visualaudio=`{}`".format(nb)
                # logger.warning("[insert_nb]"+ msg)
                step = db.path.find_one({"visual":nb.split("-")[0]})
                if step is None:
                    msg = "[insert_nb] No chapter or lesson found in db.path with visual=`{}`".format(nb.split("-")[0])
                    # logger.warning("[insert_nb]"+ msg)
                    step = {"chapter":-1, "lesson":-1, "tag": int(nb)}
            
            for record in records:
                if record["stimulus"] == "":
                    record["stimulus"] = None
                    record["stimulus_tag"] = None
                    
                else:
                    record["stimulus"] = int(record["stimulus"])
                    record["stimulus_tag"] =  NB_TAGS[int(record["stimulus"])]
                record["tag"] = int(nb)
                record["value"] = int(nb)
                record["classroom"] =  classroom_id
                record["student"] = student_id
                record["group"] = group
                record["target"] = int(nb)                
                record["target_tag"] = step["tag"]
                record["chapter"] =  step["chapter"]
                record["lesson"] =  step["lesson"]
                record["dataset"] = "numbers"
                record["subject"] =  "numbers"
                record["isClicked"] = int(record["isClicked"] == True)
                record["day"] = convert_to_date(record["unixTime"])
                record["unixTime"] = convert_to_isodate(record["unixTime"])
                record["game"] = str(record["minigameId"]) 
                record["CV"] = "N"
                del record["minigameId"]
                nb_records.append(record)
                values +=1
    if len(nb_records) > 0 and values > 0:
        db.records.insert_many(nb_records)
    else:
        status, msg = False, "[insert_nb()] No records inserted in dataset numbers for student {}".format(student_id)
        # logger.info("[insert_nb]" + msg)
    return status, msg, values
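Note: alongside convert_to_date, the inserters call a convert_to_isodate helper that is likewise not shown. A hypothetical companion to the convert_to_date sketch near the top of this page (the millisecond heuristic is again an assumption):

from datetime import datetime, timezone

def convert_to_isodate(unix_time):
    # Hypothetical sketch: normalize a unix timestamp (seconds or
    # milliseconds) to a timezone-aware datetime for Mongo storage.
    ts = float(unix_time)
    if ts > 1e12:  # heuristic: very large values are milliseconds
        ts /= 1000.0
    return datetime.fromtimestamp(ts, tz=timezone.utc)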