Example No. 1
def build_term(term):
    term_obj = Term()
    for variable, terminal in term:
        if variable:
            symbol_obj = Symbol(variable, True)
        elif terminal:
            symbol_obj = Symbol(terminal, False)
        else:
            # Neither part is set; skip the malformed pair instead of
            # re-adding the previous symbol.
            continue
        term_obj.add(symbol_obj)
    return term_obj
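A minimal usage sketch, assuming the Term and Symbol classes from the surrounding grammar code; each entry is a (variable, terminal) pair with exactly one part set (the pairs below are hypothetical):

term = build_term([('S', None), (None, 'a'), (None, 'b')])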
Example No. 2
def _get_term(data):
    # Update the Term with the given primary key, or create a new one.
    if 'id' in data:
        term = Term.objects.get(pk=data['id'])
    else:
        term = Term()
    term.name_en = data.get('name_en', '')
    term.name_la = data.get('name_la', '')
    term.name_cs = data.get('name_cs', '')
    term.system = data.get('system', '')
    term.bodypart = data.get('body_part', '')
    term.fma_id = data.get('fma_id', -1)
    term.save()
    return term
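A hedged usage sketch; the Django Term model comes from the snippet above, and the field values here are hypothetical:

term = _get_term({'name_en': 'heart', 'name_la': 'cor', 'fma_id': 7088})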
Example No. 3
def get_translations(data, text, src_lang):
    # TODO fix dictionary map for all languages
    dmap = {
        'ru': 'english-russian',
        'fr': 'english-french',
        'de': 'english-german',
    }

    for lang, dictionary in dmap.items():
        pat = '{0}/dictionary/{1}/{2}'
        url = pat.format(base, dictionary, text.replace(' ', '-'))

        resp = requests.get(url, headers=headers)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'html.parser')
        for sense in soup.find_all('div', class_='sense-body'):
            phrase = sense.find('div', class_='phrase-block')
            if phrase:
                # skip senses that are phrase blocks
                continue
            trans = sense.find('span', class_='trans')
            if trans:
                for word in stripped_text(trans).split(','):
                    term = Term(text=word, lang=lang, region=None)
                    data['translated_as'].append(term)

    return data
Example No. 4
def get_translations(text, src_lang):
    # TODO fix dictionary map for all languages
    dmap = {
        'ru': 'english-russian',
        'fr': 'english-french',
        'de': 'english-german',
    }

    txt = text.replace(' ', '-')
    for lang, dictionary in dmap.items():
        url = f'{base}/dictionary/{dictionary}/{txt}'

        resp = requests.get(url, headers=headers)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'html.parser')
        for sense in soup.find_all('div', class_='sense-body'):
            phrase = sense.find('div', class_='phrase-block')
            if phrase:
                # skip senses that are phrase blocks
                continue
            trans = sense.find('span', class_='trans')
            if trans:
                words = stripped_text(trans).split(',')
                words = [w for w in words if not is_empty(w)]
                for word in words:
                    term = Term(text=word, lang=lang, region=None)
                    yield ('translated_as', term)
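A quick sketch of how the generator version is consumed, assuming base, headers, and the helper functions from the surrounding module are in scope:

data = {'translated_as': []}
for relation, term in get_translations('water', 'en'):
    data[relation].append(term)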
Example No. 5
def get_data(query, lang):
    if lang != 'en':
        return

    url = f'https://www.macmillandictionary.com/dictionary/british/{query}'
    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')

    # get transcriptions
    transcriptions = soup.find_all(class_='PRON')
    for t in transcriptions:
        yield ('transcription',
               Term(text=stripped_text(t).replace('/', ''),
                    lang=lang,
                    region=None))

    # get tags
    crop_text = stripped_text(soup.find(class_='zwsp'))
    # strip the zero-width-space marker text out of the part of speech
    part_speech = stripped_text(soup.find(class_='PART-OF-SPEECH')).replace(
        crop_text, '')
    syntax_coding = stripped_text(soup.find(class_='SYNTAX-CODING'))

    yield ('tag', Term(text=part_speech, lang=lang, region=None))
    yield ('tag', Term(text=syntax_coding, lang=lang, region=None))

    # get definitions
    definitions = soup.find_all(class_='DEFINITION')
    for d in definitions:
        yield ('definition', Term(text=stripped_text(d),
                                  lang=lang,
                                  region=None))

    # get examples
    examples = soup.find_all(class_='EXAMPLES')
    for e in examples:
        yield ('in', Term(text=stripped_text(e), lang=lang, region=None))
    examples = soup.find_all(class_='PHR-XREF')
    for e in examples:
        yield ('in', Term(text=stripped_text(e), lang=lang, region=None))

    # get synonyms
    synonyms = soup.find_all(class_='synonyms')
    for allsyn in synonyms:
        subsynonyms = allsyn.find_all(class_='theslink')
        for syn in subsynonyms:
            if '...' not in syn.text:
                yield ('synonym',
                       Term(text=stripped_text(syn), lang=lang, region=None))

    # get audio
    audio = soup.find(class_='audio_play_button')
    if audio is not None:
        yield ('audio', File(url=audio['data-src-mp3'], region=None))
        yield ('audio', File(url=audio['data-src-ogg'], region=None))
Example No. 6
def define_word(text, lang='en', source_idx=-1, count=1):
    term_id = define_term(Term(text=text, lang=lang, region=None))
    source_list = sources if source_idx < 0 else sources[
        source_idx:source_idx + count]
    for source in source_list:
        data = get_data_safe(source, text, lang)
        if data is None:
            sys.exit(-1)
        push_data(term_id, data)
Example No. 7
def parse_phrase_row(row, lang, trans_lang, tags):
    def parse_td(td):
        a = td.find('a')
        if a is None:
            # cell without a link; signal the caller to skip this row
            return None
        return stripped_text(a)

    result = [parse_td(t) for t in row.find_all('td')]
    if len(result) != 2:
        return []
    if any(t is None for t in result):
        return []
    term = Term(text=result[0], lang=lang, region=None)
    trans = Term(text=result[1], lang=trans_lang, region=None)
    return [TermWithData(term, {'tag': tags, 'translated_as': [trans]})]
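For context, a hedged sketch of the kind of row this parser handles; the HTML is hypothetical, and Term, TermWithData, and stripped_text come from the surrounding module:

html = '<tr><td><a>good morning</a></td><td><a>доброе утро</a></td></tr>'
row = BeautifulSoup(html, 'html.parser').tr
parsed = parse_phrase_row(row, 'en', 'ru', tags=[])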
Example No. 8
def achievements(request):
    current_term = Term.current_term_key()
    active = Participant.objects.filter(is_active=True)
    with_awards = active.filter(grant__term_id=current_term).annotate(
            num_grants=Count('grant'),
            time=Max('grant__granted')).order_by('-num_grants', '-time')
    without_awards = active.exclude(grant__term_id=current_term)
    # request.REQUEST was removed in Django 1.9; read query params from GET.
    hide_nominate_link = request.GET.get('hide_nominate_links', False)
    return render(request, 'achievements.html',
            {'participants': itertools.chain(with_awards, without_awards),
                'show_nominate_link': not hide_nominate_link})
Example No. 9
def parse_thesaurus(lang, page):
    soup = BeautifulSoup(page, 'html.parser')

    dlist = soup.find_all('span', class_='syn-list')
    for d in dlist:
        synonyms = d.find_all('a')
        for s in synonyms:
            yield ('synonym',
                   Term(text=stripped_text(s), lang=lang, region=None))

    dlist = soup.find_all('span', class_='rel-list')
    for d in dlist:
        related = d.find_all('a')
        for r in related:
            yield ('related',
                   Term(text=stripped_text(r), lang=lang, region=None))

    dlist = soup.find_all('span', class_='ant-list')
    for d in dlist:
        antonyms = d.find_all('a')
        for r in antonyms:
            yield ('antonym',
                   Term(text=stripped_text(r), lang=lang, region=None))
Example No. 10
def achievements(request):
    current_term = Term.current_term_key()
    active = Participant.objects.filter(is_active=True)
    with_awards = active.filter(grant__term_id=current_term).annotate(
        num_grants=Count('grant'),
        time=Max('grant__granted')).order_by('-num_grants', '-time')
    without_awards = active.exclude(grant__term_id=current_term)
    # request.REQUEST was removed in Django 1.9; read query params from GET.
    hide_nominate_link = request.GET.get('hide_nominate_links', False)
    return render(
        request, 'achievements.html', {
            'participants': itertools.chain(with_awards, without_awards),
            'show_nominate_link': not hide_nominate_link
        })
Example No. 11
def __init__(self, *args, **kwargs):
    super(NominatePersonForm, self).__init__(*args, **kwargs)
    this_term = Term.current_term_key()
    term_grants = Grant.objects.filter(
        participant=self.instance.participant,
        term=this_term)
    term_user_nominations = Nomination.objects.filter(
        nominator=self.instance.nominator,
        participant=self.instance.participant,
        term=this_term)
    self.fields['achievement'] = forms.ModelChoiceField(
        queryset=Achievement.objects.filter(can_nominate=True).exclude(
            grant__in=term_grants).exclude(
            nomination__in=term_user_nominations))
Example No. 12
def get_data(query, lang):
    if lang != 'en':
        return None

    data = {
        'audio': [],
        #'visual': [],
        'tag': [],
        'transcription': [],
        'definition': [],
        'in': [],
        'synonym': [],
        'antonym': [],
        'related': []
    }

    pat = 'https://www.merriam-webster.com/dictionary/{0}'

    url = pat.format(query)

    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')

    # find transcription and audio
    prs = soup.find('span', class_='prs')

    if prs is not None:
        transcription = stripped_text(prs.find('span', class_='pr'))
        # wrap in a Term for consistency with the other collected values
        data['transcription'].append(
            Term(text=transcription, lang=lang, region=None))

        btns = prs.find_all('a', class_='play-pron')
        urls = [parse_btn(b) for b in btns]
        urls = [u for u in urls if utils.url_exists(u)]
        for url in urls:
            data['audio'].append(File(url=url, region=None))

    # find definitions and 'in'

    vg = soup.find_all('div', class_='vg')

    for v in vg:
        definitions = v.find_all(class_='dt')
        for d in definitions:
            text = stripped_text(d)
            # definitions start with a ':' carried by an element of class mw_t_bc
            if d.find(class_='mw_t_bc') is not None:
                text = text.lstrip(':').strip()
                # definitions may embed example sentences (class ex-sent); drop them
                if d.find(class_='ex-sent') is not None:
                    text = text.split('\n')[0].strip()
                data['definition'].append(
                    Term(text=text, lang=lang, region=None))
    # parse examples
    data_in = soup.find_all(class_='ex-sent')
    for d in data_in:
        if 't' in d['class']:
            data['in'].append(
                Term(text=stripped_text(d), lang=lang, region=None))
    # parse related
    ure = soup.find_all(class_='ure')
    for d in ure:
        data['related'].append(
            Term(text=stripped_text(d), lang=lang, region=None))
    # parse tags
    tag = soup.find_all('span', class_='fl')
    for d in tag:
        data['tag'].append(Term(text=stripped_text(d), lang=lang, region=None))

    # add the 'word' tag, because the query is a single word
    data['tag'].append(Term(text='word', lang=lang, region=None))

    # move on to the thesaurus page
    pat_t = 'https://www.merriam-webster.com/thesaurus/{0}'
    url_t = pat_t.format(query)
    resp = requests.get(url_t, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')

    dlist = soup.find_all('span', class_='syn-list')
    for d in dlist:
        synonyms = d.find_all('a')
        for s in synonyms:
            data['synonym'].append(
                Term(text=stripped_text(s), lang=lang, region=None))

    dlist = soup.find_all('span', class_='rel-list')
    for d in dlist:
        related = d.find_all('a')
        for r in related:
            data['related'].append(
                Term(text=stripped_text(r), lang=lang, region=None))

    dlist = soup.find_all('span', class_='ant-list')
    for d in dlist:
        antonyms = d.find_all('a')
        for r in antonyms:
            data['antonym'].append(
                Term(text=stripped_text(r), lang=lang, region=None))

    return data
Example No. 13
def define_word(text, lang='en', source_idx=-1):
    term_id = define_term(Term(text=text, lang=lang, region=None))
    source_list = sources if source_idx < 0 else [sources[source_idx]]
    for source in source_list:
        data = source.get_data(text, lang)
        push_data(term_id, data)
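A hedged call sketch, assuming the module-level sources list, define_term, and push_data from the surrounding script:

define_word('serendipity')                 # query all registered sources
define_word('serendipity', source_idx=0)   # query a single source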
Example No. 14
def get_data(query, lang):
    if lang != 'en':
        return

    url = f'https://www.merriam-webster.com/dictionary/{query}'

    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')

    # find transcription and audio
    prs = soup.find('span', class_='prs')

    if prs is not None:
        transcription = prs.find('span', class_='pr')
        transcription = stripped_text(transcription)

        yield ('transcription',
               Term(text=transcription, lang='ipa', region=None))

        btns = prs.find_all('a', class_='play-pron')
        urls = [parse_btn(b) for b in btns]
        urls = [u for u in urls if utils.url_exists(u)]
        for url in urls:
            yield ('audio', File(url=url, region=None))

    # find definitions and 'in'

    vg = soup.find_all('div', class_='vg')

    for v in vg:
        definitions = v.find_all(class_='dt')
        for d in definitions:
            text = stripped_text(d)
            # definitions start with a ':' carried by an element of class mw_t_bc
            if d.find(class_='mw_t_bc') is not None:
                text = text.lstrip(':').strip()
                # definitions may embed example sentences (class ex-sent); drop them
                if d.find(class_='ex-sent') is not None:
                    text = text.split('\n')[0].strip()
                yield ('definition', Term(text=text, lang=lang, region=None))
    # parse examples
    data_in = soup.find_all(class_='ex-sent')
    for d in data_in:
        if 't' in d['class']:
            yield ('in', Term(text=stripped_text(d), lang=lang, region=None))
    # parse related
    ure = soup.find_all(class_='ure')
    for d in ure:
        yield ('related', Term(text=stripped_text(d), lang=lang, region=None))
    # parse tags
    tag = soup.find_all('span', class_='fl')
    for d in tag:
        yield ('tag', Term(text=stripped_text(d), lang=lang, region=None))

    # add the 'word' tag, because the query is a single word
    yield ('tag', Term(text='word', lang=lang, region=None))

    # move on to the thesaurus page
    url_t = f'https://www.merriam-webster.com/thesaurus/{query}'
    resp = requests.get(url_t, headers=headers)
    if resp.ok:
        yield from parse_thesaurus(lang, resp.text)
Example No. 15
import sys
import utils
import requests
import json
from bs4 import BeautifulSoup
from models import Term, TermWithData

headers = {
    'User-Agent': utils.CHROME_USER_AGENT,
    'Accept': 'text/html',
}

# TODO consider collecting automatically https://www.multitran.com/m.exe?s=place&l1=2&l2=1&fl=1
categories = [{
    'tag': [
        Term(text='idiom', lang='en', region=None),
        Term(text='идиома', lang='ru', region=None),
    ],
    'id': 895,
}, {
    'tag': [
        Term(text='proverb', lang='en', region=None),
        Term(text='пословица', lang='ru', region=None),
    ],
    'id': 310,
}, {
    'tag': [
        Term(text='americanism', lang='en', region=None),
        Term(text='американизм', lang='ru', region=None),
Example No. 16
def query(term_id):
    # .dicts() makes the query rows iterate as plain dicts
    return Term.select(Term.title).where(Term.id == term_id).dicts()
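A usage sketch for this peewee query; the Term model comes from the snippet above, and the id value below is hypothetical:

for row in query(42):
    print(row['title'])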
Example No. 17
def get_data(text, lang):
    if lang != 'en':
        return

    txt = text.replace(' ', '-')
    url = f'{base}/dictionary/english/{txt}'

    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    codes = {
        'C': 'countable',
        'U': 'uncountable',
        'S': 'singular',
    }
    posgram_found = False
    gram_found = False

    if utils.is_word(text):
        yield ('tag', Term(text='word', lang=lang, region=None))

    soup = BeautifulSoup(resp.text, 'html.parser')
    page = soup.find('div', class_='page')
    for dictionary in page.find_all('div', class_='dictionary'):
        header = dictionary.find('div', class_='pos-header')
        body = dictionary.find('div', class_='pos-body')

        posgram = header.find('div', class_='posgram')
        if posgram and not posgram_found:
            pos = find_strip(posgram, 'span', class_='pos')
            term = Term(text=pos, lang=lang, region=None)
            yield ('tag', term)
            posgram_found = True
        if not gram_found:
            for gram in body.find_all('span', class_='gram'):
                for gc in gram.find_all('span', class_='gc'):
                    code = stripped_text(gc)
                    if code in codes and not gram_found:
                        term = Term(text=codes[code], lang=lang, region=None)
                        yield ('tag', term)
                        gram_found = True

        # parse pronunciations
        for dpron in header.find_all('span', class_='dpron-i'):
            region = find_strip(dpron, 'span', 'region')
            amp = header.find('amp-audio')
            if amp is not None:
                for source in amp.find_all('source'):
                    file = File(url=base + source.attrs['src'],
                                region=region)
                    yield ('audio', file)

            ipa = find_strip(dpron, 'span', class_='ipa')
            if not is_empty(ipa):
                yield ('transcription', Term(text=ipa,
                                             lang=lang,
                                             region=region))

        for dblock in body.find_all('div', class_='def-block'):
            def_text = stripped_text(dblock.find('div', class_='def'))
            if not is_empty(def_text):
                yield ('definition', Term(text=def_text,
                                          lang=lang,
                                          region=None))
            img = dblock.find('amp-img')
            if img is not None:
                file = File(url=base + img.attrs['src'], region=None)
                yield ('visual', file)
            for eg in dblock.find_all('span', 'eg'):
                term = Term(text=stripped_text(eg), lang=lang, region=None)
                yield ('in', term)

    for dataset in page.find_all('div', class_='dataset'):
        for eg in dataset.find_all('span', class_='deg'):
            term = Term(text=stripped_text(eg), lang=lang, region=None)
            yield ('in', term)
        cpegs = dataset.find('div', class_='cpegs')
        if cpegs:
            for lbb in cpegs.find_all('div', class_='lbb'):
                for a in lbb.find_all('a', class_='hdib'):
                    term = Term(text=stripped_text(a), lang=lang, region=None)
                    yield ('collocation', term)

    yield from get_translations(text, lang)
Example No. 18
def get_all_terms(self):
    # .dicts() makes the query rows iterate as plain dicts
    return Term.select(Term.title, Term.id).dicts()