Example #1
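These snippets all come from one scraping and database-maintenance module and lean on names that are not shown here: project constants (LOCALHOST, PORT, DB_NAME, VERBS, VERB, TRANSL, PARADIGM, SOURCE, the *_JSON paths) and helpers (load_utf_json, dump_utf_json, read_json_lines, add_indices, get_paradigm, get_shortened) defined elsewhere in the repo. Every function also drives a counter progress helper with next(). A plausible shared header plus a minimal sketch of counter, assuming it is a generator printing a same-line progress indicator:

import sys
import re

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

def counter(total=None):
    # Assumed helper: a generator that prints same-line progress;
    # callers advance it with next(count).
    current = 0
    while True:
        current += 1
        progress = '{}/{}'.format(current, total) if total else str(current)
        sys.stdout.write('\r' + progress)
        sys.stdout.flush()
        yield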
def scrape_comments(self):
    print("Scraping comments...")
    count = counter(len(self.thread_urls))
    for thread_url in self.thread_urls:
        self.scrape_comment(thread_url)
        next(count)
    print("\nScraped {} comments. {} comments proved unavailable".format(
        len(self.comments), len(self.unavailable_comments)))
    for thread_url in self.unavailable_comments:
        print(thread_url)
def fix_do_transls():
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    transls = load_utf_json(DO_JSON).items()
    count = counter(len(transls))
    for verb, transl in transls:
        next(count)
        # Swap the placeholder 'do' translation for the scraped one.
        match = coll.find_one({VERB: verb, TRANSL: 'do'})
        match[TRANSL] = transl
        coll.replace_one({'_id': match['_id']}, match)
    print()
Example #3
def filter_verbs(filtered_list_json,
                 unfiltered_list_json=UNFILTERED_WIKILEXICO_LIST_JSON):
    print("Filtering {} --> {}".format(unfiltered_list_json,
                                       filtered_list_json))
    count = counter()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    for verb in read_json_lines(unfiltered_list_json):
        next(count)
        # Yield only verbs not already stored in the collection.
        if not coll.count_documents({VERB: verb}):
            yield verb
    print()
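Several of these generators (filter_verbs here, and collect_active_voice_paradigms and collect_paradigms below) take an output path as their first argument but never touch it, which suggests they are wrapped by a decorator that streams their yields to that file. A hypothetical sketch of such a wrapper (dump_json_lines does not appear in the source):

import functools
import json

def dump_json_lines(gen_func):
    # Hypothetical decorator: writes each yielded item to the JSON-lines
    # file named by the generator's (otherwise unused) first argument.
    @functools.wraps(gen_func)
    def wrapper(out_json, *args, **kwargs):
        with open(out_json, 'w', encoding='utf-8') as out:
            for item in gen_func(out_json, *args, **kwargs):
                out.write(json.dumps(item, ensure_ascii=False) + '\n')
    return wrapper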
def collect_do_transls():
    pattern = re.compile(r'\((.+?)\)')
    doed = load_utf_json(DO_JSON)
    transls = dict()
    count = counter(len(doed))
    for verb in doed:
        next(count)
        # Pull the parenthesized translation out of the verb's main-form span.
        soup = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content, 'lxml')
        transls[verb] = pattern.findall(soup.find('span', id='mainform').text)[0]
    print()
    dump_utf_json(transls, COOLJUGATOR_DO_TRANSL)
def edit_field(fieldname, func, fltr=None, dbname=DB_NAME, collname=VERBS):
    if fltr is None:
        fltr = dict()
    print("{}.{}: editing '{}' with '{}'...".format(dbname, collname,
                                                    fieldname, func.__name__))
    target = MongoClient(LOCALHOST, PORT)[dbname][collname]
    count = counter(target.count_documents(fltr))
    for entry in target.find(fltr):
        next(count)
        # Rewrite the field through the supplied function and store it back.
        entry[fieldname] = func(entry.get(fieldname))
        target.replace_one({'_id': entry['_id']}, entry)
    print()
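A hypothetical call, stripping stray whitespace from every stored translation (the empty default filter touches the whole collection; the lambda is illustrative):

edit_field(TRANSL, lambda transl: transl.strip() if transl else transl)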
def get_fieldnames(list_json=COOLJUGATOR_LIST_JSON,
                   fieldnames_json='cooljugator_fieldnames.json'):
    fieldnames = set()
    verbs = load_utf_json(list_json)
    count = counter(len(verbs))
    for verb, _ in verbs:
        next(count)
        soup = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content, 'lxml')
        # Collect the ids of all conjugation cells; some cells have none.
        for cell in soup.find_all('div', {'class': 'conjugation-cell'}):
            cell_id = cell.attrs.get('id')
            if cell_id:
                fieldnames.add(cell_id)
    dump_utf_json(sorted(fieldnames), fieldnames_json)
def remove_morphologically_abnormal_verbs():
    abnormal_count = 0
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count_documents({}))
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            # Keep only verbs with the regular Greek endings -ω, -ώ or -αι.
            if not (verb.endswith('ω') or verb.endswith('ώ')
                    or verb.endswith('αι')):
                coll.delete_one({VERB: verb})
                abnormal_count += 1
    print("\nRemoved {} abnormal verbs".format(abnormal_count))
def print_verbs(fieldname, fltr, func=None, dbname=DB_NAME, collname=VERBS):
    coll = MongoClient(LOCALHOST, PORT)[dbname][collname]
    match = coll.find(fltr)
    total = coll.count_documents(fltr)
    if func:
        # Screen entries through the extra predicate one by one.
        res = list()
        count = counter(total)
        for entry in match:
            next(count)
            if func(entry):
                res.append(entry[fieldname])
        print("\n{} matching entries".format(len(res)))
    else:
        print(total, "matching entries")
        res = [entry[fieldname] for entry in match]
    for item in res:
        print(item)
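A hypothetical invocation: with a plain filter it prints every match, while func screens entries through an arbitrary predicate, e.g. verbs that still lack a stored paradigm:

print_verbs(VERB, dict(), func=lambda entry: not entry.get(PARADIGM))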
def copy_collection(target_collname,
                    dbname=DB_NAME,
                    source_collname=VERBS,
                    indices=(VERB, PARADIGM)):
    print("[{}]: copying [{}] to [{}]...".format(dbname, source_collname,
                                                 target_collname))
    assert target_collname != source_collname, "Collections should not have identical names"
    database = MongoClient(LOCALHOST, PORT)[dbname]
    target_coll = database[target_collname]
    target_coll.drop()
    source_coll = database[source_collname]
    count = counter(source_coll.count_documents({}))
    for entry in source_coll.find():
        next(count)
        target_coll.insert_one(entry)
    add_indices(target_coll, indices)
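add_indices itself is not shown; given how it is called here and in upload below, a minimal assumed implementation would be one ascending index per listed field:

def add_indices(coll, fieldnames):
    # Assumed helper: create an ascending index on each listed field.
    for fieldname in fieldnames:
        coll.create_index(fieldname)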
Example #10
def collect_active_voice_paradigms(raw_paradigm_json,
                                   list_json=FILTERED_WIKILEXICO_LIST_JSON):
    count = counter()
    for verb in read_json_lines(list_json):
        next(count)
        try:
            paradigm = get_paradigm(verb, active_voice=True)
        except Exception:
            # Report which verb failed before re-raising with its traceback.
            print('\n{}\n'.format(verb))
            raise
        if paradigm:
            entry = get_shortened(paradigm)
            entry[PARADIGM] = paradigm
            yield entry
Example #11
def upload(source_json,
           source,
           db_name=DB_NAME,
           coll_name=VERBS,
           drop=False,
           indices=(VERB, PARADIGM)):
    target = MongoClient(LOCALHOST, PORT)[db_name][coll_name]
    if drop:
        target.drop()
    print('Initially,', target.count_documents({}), 'entries')
    count = counter()
    for line in read_json_lines(source_json):
        next(count)
        # Tag each entry with where it was scraped from before inserting.
        line[SOURCE] = source
        target.insert_one(line)
    add_indices(target, indices)
    print('\nCurrently,', target.count_documents({}), 'entries')
def collect_duplicates():
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count_documents({}))
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            # A verb seen twice across entries is a duplicate.
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    dump_utf_json(sorted(duplicates), DUPLICATES_JSON)
def add_field(fieldname,
              fieldcontent,
              fltr=None,
              dbname=DB_NAME,
              collname=VERBS):
    if fltr is None:
        fltr = dict()
    print("{}.{}: setting '{}' to \"{}\"...".format(dbname, collname,
                                                    fieldname, fieldcontent))
    target = MongoClient(LOCALHOST, PORT)[dbname][collname]
    count = counter(target.count_documents(fltr))
    for entry in target.find(fltr):
        next(count)
        entry[fieldname] = fieldcontent
        target.replace_one({'_id': entry['_id']}, entry)
    print()
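Because the value is constant, the same edit can also run server-side in a single call, at the cost of the per-document progress counter:

# Equivalent server-side update, no per-document round trips:
MongoClient(LOCALHOST, PORT)[dbname][collname].update_many(
    fltr, {'$set': {fieldname: fieldcontent}})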
def collect_paradigms(raw_paradigm_json, list_json=COOLJUGATOR_LIST_JSON):
    verbs = load_utf_json(list_json)
    exceptions = dict()
    count = counter(len(verbs))
    for verb, transl in verbs:
        next(count)
        paradigm, errors = get_paradigm(verb)
        paradigm.update({VERB: verb, TRANSL: transl})
        if errors:
            exceptions[verb] = errors
        yield paradigm
    if exceptions:
        print('\n\nExceptions:')
        for verb, errors in exceptions.items():
            print(verb)
            for item in errors:
                print("     {}".format(item))
            print()
Example #15
def collect_verbs(unfiltered_list_json):
    print("Collecting verbs to", unfiltered_list_json)
    page_url = 'https://el.wiktionary.org/w/index.php?title=%CE%9A%CE%B1%CF%84%CE%B7%CE%B3%CE%BF%CF%81%CE%AF%CE%B1:' \
               '%CE%A1%CE%AE%CE%BC%CE%B1%CF%84%CE%B1_(%CE%BD%CE%AD%CE%B1_%CE%B5%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA' \
               '%CE%AC)&from=%CE%B1#mw-pages'
    count = counter()
    while True:
        soup = BeautifulSoup(requests.get(page_url).content, 'lxml')
        for a_tag in soup.find_all('a', href=True):
            a_text = a_tag.text
            # Entry links have a title attribute equal to their link text.
            if a_tag.get('title') == a_text:
                next(count)
                yield a_text
        try:
            # Follow the pagination link ("επόμενη σελίδα" = "next page");
            # find() returns None on the last page, so subscripting it
            # raises TypeError and ends the crawl.
            page_url = 'https://el.wiktionary.org' + soup.find(
                'a', href=True, string="επόμενη σελίδα")['href']
        except TypeError:
            break
    print()