Exemplo n.º 1
0
def schools(data, town):
    """Build the mapping of open data id -> ``School`` from the raw entries.

    :param data: mapping of school type -> iterable of entries; every entry
        is indexed with the ``constants.*_COL`` keys.
    :param town: value handed to ``_township_for_name`` together with the
        entry's township name to resolve the township.
    :return: dict mapping each ``OpenDataId`` to the ``School`` built for it.
    :raises KeyError: if two entries carry the same open data id.
    """
    found = {}
    for school_type, content in data.items():
        for entry in content:
            township = _township_for_name(town, entry[constants.TOWNSHIP_COL])
            open_data_id = OpenDataId(entry[constants.OPEN_DATA_ID_COL])
            if open_data_id in found:
                # Interpolate the id into the message: passing it as a second
                # exception argument left the placeholder unformatted.
                raise KeyError(
                    "Found duplicate open data id: %s" % (open_data_id,))
            found[open_data_id] = School(
                id=open_data_id,
                township_id=township.id,
                type=school_type,
                name=entry[constants.NAME_COL],
                name_lt=to_latin(entry[constants.NAME_COL]),
                address=entry[constants.ADDRESS_COL],
                address_lt=to_latin(entry[constants.ADDRESS_COL]),
                place=entry[constants.PLACE_COL],
                place_lt=to_latin(entry[constants.PLACE_COL]),
                postcode=entry[constants.POSTCODE_COL],
                website=entry[constants.WEBSITE_COL],
                phone=entry[constants.PHONE_COL],
                email=entry[constants.EMAIL_COL])
    print("Found %d schools" % len(found))
    return found
Exemplo n.º 2
0
    def data_importer_of_municipality_prijepolje(self):
        """Re-import the "rashodi" documents for the municipality of Prijepolje.

        Removes the municipality's existing "rashodi" documents from
        ``db.opstine`` and then inserts one document for every qualifying row
        of ``data/rashodi/prijepolje.csv``, tagged with the program and
        sub-program headings most recently seen in the file.
        """
        db.opstine.remove({"opstina.latinica": "Prijepolje", "tipPodataka.slug": "rashodi"})

        # Look the program -> sub-programs mapping up once instead of per row.
        program_categories = utils.program_categories_for_prijepolje()
        program = ""
        subprogram = ""

        # Read data from the Prijepolje CSV file; the context manager closes
        # the handle even when a row fails to import.
        with open("data/rashodi/prijepolje.csv", "r") as csv_file:
            for index, row in enumerate(reader(csv_file, delimiter=",")):
                if index == 0:
                    # The first row is never imported.
                    continue

                # A row with an empty second column whose third column is a
                # known program name starts a new program.
                if row[1] in ["", " "] and row[2] not in ["", " "] and row[2].strip() in program_categories:
                    program = row[2].strip()

                # A third column listed under the current program starts a
                # new sub-program.
                if program != "" and row[2].strip() in program_categories[program]:
                    subprogram = row[2].strip()

                if row[1] not in ["", " "] and len(row[1]) > 2 and program not in ["", " "] and subprogram not in ["", " "]:
                    json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                        "Пријепоље",
                        row[1],
                        row[2],
                        row[3],
                        row[4],
                        row[5],
                        row[6],
                        None
                    )
                    json_doc["program"] = {}
                    json_doc["program"]["cirilica"] = program.strip()
                    json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                    json_doc["potProgram"] = {}
                    json_doc["potProgram"]["cirilica"] = subprogram.strip()
                    json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")
                    db.opstine.insert(json_doc)
                    # Single parenthesised argument: prints identically under
                    # Python 2 and Python 3.
                    print("Opstine: %s - Program: %s %s" % ("Пријепоље", program, row[1]))
    def build_docs(self, row):
        """Convert one source row into a single-element list of documents.

        The sub-region name comes from ``row[0]``, the activity id and
        description from ``row[1]`` and ``row[2]``, and the cost from
        ``row[9]``. The year is fixed to 2010.
        """
        # Strip the commas from the cost; a blank cost becomes the integer 0,
        # any other value is kept as the comma-free string.
        cost = row[9].replace(',', '')
        if not cost.strip():
            cost = 0

        region = {
            'name': self.get_region(),
            'slug': slugify(self.get_region(), to_lower=True),
            'subregion': {
                'name': cyrtranslit.to_latin(row[0]),
                'slug': cyrtranslit.to_latin(slugify(row[0], to_lower=True)),
            },
        }
        activity = {
            'id': int(row[1]),
            'description': cyrtranslit.to_latin(row[2]),
        }
        dataset = {
            'name': self.get_dataset(),
            'slug': slugify(self.get_dataset(), to_lower=True),
        }
        doc = {
            'region': region,
            'activity': activity,
            'dataset': dataset,
            'cost': cost,
            'year': 2010,
        }

        # Progress feedback for whoever is running the import.
        summary = (activity['id'], activity['description'], doc['cost'],
                   region['name'], doc['year'])
        print('%s - %s: %s (%s %i)' % summary)

        return [doc]
Exemplo n.º 4
0
def gen_email_addr(frm):
    """Make up a sender e-mail address that fits the organisation name ``frm``.

    The local part is picked at random from a fixed word list. The domain
    depends on the kind of organisation found in ``frm``:

    * court ("суд"/"Суд"): the lower-cased first letters of the
      transliterated words, followed by ``.court.gov.ru``;
    * tax inspection ("инспекция"/"ИФНС"): ``ifns`` plus the last two
      characters of ``frm``, followed by ``.gov.ru``;
    * bank ("банк"/"Банк"): the transliterated name without a leading
      "bank", with a random top-level domain;
    * anything else: the transliterated name with a random top-level domain.
    """
    local_parts = [
        'info', 'process', 'warning', 'vzyskanie', 'shtraf', 'dolg', 'alarm',
        'zapros', 'request', 'tax', 'nedoimka', 'uvedomlenie'
    ]
    top_level_domains = ['com', 'net', 'ru', 'org']
    address = random.choice(local_parts).rstrip()
    unquoted = frm.replace('"', '')

    if "суд" in frm or "Суд" in frm:
        words = cyrtranslit.to_latin(unquoted, 'ru').replace("'", '').split(' ')
        initials = ''.join(word[:1].lower() for word in words)
        return address + "@" + initials.rstrip() + ".court.gov.ru"

    if "инспекция" in frm or "ИФНС" in frm:
        return address + "@ifns" + frm.rstrip()[-2:] + ".gov.ru"

    latin = cyrtranslit.to_latin(unquoted, 'ru').lower().replace("'", '')
    is_bank = "банк" in frm or "Банк" in frm
    if is_bank and latin[:4] == "bank":
        # Drop the leading "bank" so it does not end up in the domain.
        latin = latin.split(latin[:4], 1)[1]
    domain = latin.replace(' ', '').rstrip()
    return address + "@" + domain + "." + random.choice(top_level_domains)
Exemplo n.º 5
0
def autor_pesme(a, datoteka):
    """Write the Latin transliteration of one author's song lines to a file.

    Walks every entry of the module-level ``authors`` under ``myroot``; for
    each author whose path ends with ``a``, reads every file in every album
    sub-directory, transliterates each line with ``cyrtranslit.to_latin``,
    prints it and writes it to ``datoteka``.

    :param a: suffix that an author's path must end with to be processed.
    :param datoteka: path of the output file (overwritten).
    """
    with open(datoteka, 'w') as x:
        for author in authors:
            curr_path = '{}/{}'.format(myroot, author)
            if not curr_path.endswith('{}'.format(a)):
                continue
            albums = [
                entry for entry in listdir(curr_path)
                if not isfile(join(curr_path, entry))
            ]
            for album in albums:
                album_path = '{}/{}'.format(curr_path, album)
                songs = [
                    f for f in listdir(album_path)
                    if isfile(join(album_path, f))
                ]
                for song in songs:
                    if song.startswith('.DS_S'):
                        continue
                    song_path = '{}/{}'.format(album_path, song)
                    # Close each song file as soon as it has been read.
                    with open(song_path, encoding="utf8",
                              errors='ignore') as song_file:
                        # The piece after the last newline is dropped.
                        verses = song_file.read().split('\n')[:-1]
                    for stih in verses:
                        # Transliterate once and reuse for both outputs.
                        latin_line = cyrtranslit.to_latin('{}'.format(stih))
                        print(latin_line)
                        x.write(latin_line + '\n')
Exemplo n.º 6
0
def utmnamecreate(country, city, countvvod, utm_campaign, utm_term,
                  utm_content):
    """Build the UTM-tagged URL for ``city`` and return it transliterated.

    The base URL is the name of the last ``Urlname`` object. The five UTM
    values are read from the ``city`` row of the ``k[country]`` table. When
    ``countvvod`` is 1, 2 or 3, that many "введите…&" placeholders are
    replaced, in order, with ``utm_campaign``, ``utm_term`` and
    ``utm_content``. The result is passed through ``deleteNO`` together with
    its "…нет" matches, transliterated and stripped of spaces.
    """
    url = cyrtranslit.to_latin(str(Urlname.objects.last().name),
                               'ru').replace(" ", "")
    table = k[country]
    dataframe = pd.DataFrame(table[1:], columns=table[0])
    city_rows = dataframe['Название'] == city
    print(dataframe.loc[city_rows, 'utm_source'])

    def city_value(column):
        # The single value stored in `column` for the selected city.
        return str(dataframe.loc[city_rows, column].item())

    utmname = (url + '?'
               + 'utm_source=' + city_value('utm_source')
               + '&utm_medium=' + city_value('utm_medium')
               + '&utm_campaign=' + city_value('utm_campaign')
               + '&utm_term=' + city_value('utm_term')
               + '&utm_content=' + city_value('utm_content')
               + '&')

    # Values supplied by the caller, in the order of the placeholders they
    # replace.
    if countvvod == 1:
        user_values = (utm_campaign,)
    elif countvvod == 2:
        user_values = (utm_campaign, utm_term)
    elif countvvod == 3:
        user_values = (utm_campaign, utm_term, utm_content)
    else:
        user_values = ()

    if user_values:
        # Placeholders are located once, before any replacement happens.
        placeholders = re.findall(r'введите.*?[&]', utmname)
        for position, value in enumerate(user_values):
            utmname = utmname.replace(placeholders[position], value + '&')

    trimmed = utmname[:-1]  # without the trailing '&'
    itogfirst = deleteNO(trimmed, re.findall(r'[^&]*нет', trimmed))
    return cyrtranslit.to_latin(itogfirst, 'ru').replace(" ", "")
Exemplo n.º 7
0
    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole cyrillic alphabet must transliterate to the expected
        latin alphabet when the 'mk' language code is used.
        '''
        actual = cyrtranslit.to_latin(macedonian_alphabet_cyrillic,
                                      lang_code='mk')

        self.assertEqual(actual, macedonian_alphabet_latin)
Exemplo n.º 8
0
def cyr2latTranslate(src_dir, dest_dir):
    """Mirror ``src_dir`` into ``dest_dir``, transliterating selected files.

    Sub-directories are processed recursively. Files whose extension is in
    ``extensionList`` are read as UTF-8, transliterated from Serbian
    Cyrillic to Latin (``.rst`` files are additionally passed through
    ``title_fix``) and written as UTF-8. Every other file is copied
    unchanged, byte for byte, so non-text files no longer have to be
    decodable as UTF-8.

    :param src_dir: directory to read from.
    :param dest_dir: directory to write to; created when missing.
    """
    os.makedirs(dest_dir, exist_ok=True)
    for item in os.listdir(src_dir):
        s = os.path.join(src_dir, item)
        d = os.path.join(dest_dir, item)
        if os.path.isdir(s):
            cyr2latTranslate(s, d)
            continue

        extension = os.path.splitext(item)[1][1:]
        if extension not in extensionList:
            # Nothing to transliterate: plain copy, no decoding involved.
            shutil.copyfile(s, d)
            continue

        # Context managers make sure both handles are closed.
        with open(s, encoding="utf8") as source_file:
            content = source_file.read()
        new_content = cyrtranslit.to_latin(content, 'sr')
        if extension == "rst":
            new_content = title_fix(new_content)
        with open(d, "w", encoding="utf8") as target_file:
            target_file.write(new_content)
Exemplo n.º 9
0
    def save(self, commit=True):
        """Create the user, deriving the username from the full name.

        The full name is transliterated to Latin and stripped of ``#`` and
        apostrophes. The username is the lower-cased first letters of the
        second and (when present) third word, followed by the lower-cased
        first word. A name consisting of a single word yields just that word.
        If the username is already taken (case-insensitively), a random
        number below 100 is appended.

        :param commit: when true, the user and its profile are saved.
        :return: the (possibly unsaved) user instance.
        :raises IndexError: if the cleaned full name contains no words.
        """
        user = super(CreateUserForm, self).save(commit=False)
        # NOTE(review): every account starts with the same hard-coded
        # password — confirm it has to be changed before first use.
        user.set_password("password")
        user.is_active = True

        full_name = self.cleaned_data.get('full_name')
        full_name_en = cyrtranslit.to_latin(full_name, 'ru')
        full_name_clean = full_name_en.replace('#', '').replace("'", "")

        fio = full_name_clean.split()
        # Slicing copes with names of one, two, three or more words; indexing
        # fio[1] directly crashed on a single-word name.
        initials = ''.join(part[0].lower() for part in fio[1:3])
        username = initials + fio[0].lower()
        # exists() asks the database for a yes/no answer instead of loading
        # every matching row.
        if User.objects.filter(username__iexact=username).exists():
            username += str(randrange(100))
        user.username = username

        if commit:
            user.save()
            user.profile.birth_date = self.cleaned_data.get("birth_date")
            user.profile.full_name = self.cleaned_data.get("full_name")
            user.profile.user_type = self.cleaned_data.get("user_type")
            user.profile.save()
        return user
Exemplo n.º 10
0
    def save(self, *args, **kwargs):
        """Recompute ``slug`` from ``option_name`` and then save the option.

        The name is transliterated to Latin before it is slugified.
        """
        latin_name = cyrtranslit.to_latin(self.option_name, 'ru')
        self.slug = slugify(latin_name)

        super(Option, self).save(*args, **kwargs)
Exemplo n.º 11
0
def clean(txt, tagger, le):
    """Normalise a tweet and return its known words, lemmatized.

    Mentions, links, emojis, ``#`` signs, punctuation and the text ``RT ``
    are removed; the remaining words are lower-cased and transliterated to
    Latin. Only words present in ``mapping`` are kept: their vector is read
    from the matching line of ``WORD_MODEL``, tagged with ``tagger`` and
    lemmatized according to the predicted tag.

    :param txt: raw tweet text.
    :param tagger: model whose ``predict`` receives a list with one vector.
    :param le: encoder whose ``inverse_transform`` decodes the prediction.
    :return: the lemmas joined by single spaces ('' when there are none).
    """
    text = re.sub("@[A-Za-z0-9_-]+", "", txt)  # mentions
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)  # links
    text = " ".join(text.split())  # collapse runs of whitespace
    text = ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)
    text = text.replace("#", "")  # hashtag signs
    text = re.sub(r'[^\w\s]', '', text)  # punctuation
    text = text.replace("_", " ").replace('RT ', '')  # retweet markers

    # Lower-case every word and convert it to the Latin script.
    words = [cyrtranslit.to_latin(token.lower()) for token in text.split()]

    lemmas = []
    for word in words:
        line_number = mapping.get(word)
        if not line_number:
            continue
        model_line = linecache.getline(WORD_MODEL, line_number).strip()
        # The first field of the line is skipped, the rest are the floats.
        vector = np.array([float(value) for value in model_line.split()[1:]])
        tag = le.inverse_transform(tagger.predict([vector]))[0]
        lemmas.append(lemmatize(word, tag))
    return ' '.join(lemmas)
Exemplo n.º 12
0
    def test_mix_characters(self):
        ''' In mixed input only the Serbian cyrillic characters are expected
        to be transliterated; other cyrillic characters stay as they are.
        '''
        actual = cyrtranslit.to_latin(mix_characters_some_cyrillic)

        self.assertEqual(actual, mix_characters_all_latin)
def main():
    """Regenerate the Latin localization and zip both localizations.

    Every file directly inside the Cyrillic localization directory is
    transliterated into a freshly created Latin directory. Both directories
    are then archived as ``<code>-<VERSION>.zip`` next to this script.
    """
    working_path = Path(__file__).resolve().parent

    localization_path = Path(get_loc_dir_path())
    cyrillic_path = localization_path / CYRILLIC_CODE
    latin_path = localization_path / LATIN_CODE

    # Start from an empty Latin directory.
    shutil.rmtree(latin_path, ignore_errors=True)
    os.makedirs(latin_path)

    # Only the files at the top level of the Cyrillic directory.
    file_names = next(os.walk(cyrillic_path))[2]
    for file_name in file_names:
        source_path = cyrillic_path / file_name
        target_path = latin_path / file_name
        with open(source_path, 'r', encoding='utf-8') as source, \
                open(target_path, 'w', encoding='utf-8') as target:
            target.write(to_latin(source.read()))

    for code, directory in ((CYRILLIC_CODE, cyrillic_path),
                            (LATIN_CODE, latin_path)):
        archive_base = working_path / f'{code}-{VERSION}'
        os.chdir(directory)
        shutil.make_archive(archive_base, 'zip', directory)
Exemplo n.º 14
0
    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole cyrillic alphabet must transliterate to the expected
        latin alphabet when the 'bg' language code is used.
        '''
        actual = cyrtranslit.to_latin(bulgarian_alphabet_cyrillic,
                                      lang_code='bg')

        self.assertEqual(actual, bulgarian_alphabet_latin)
Exemplo n.º 15
0
    def get(self, request, *args, **kwargs):
        """Serve a user's competition badge as a file download.

        The badge is only served to an authenticated user who is an admin,
        the badge owner (``kwargs['pk']``) or the creator of the competition
        (``kwargs['comp']``).

        :raises Http404: when access is not allowed or building the response
            fails for any reason.
        """
        activate_language(request.session)
        competition = Competition.objects.get(pk=kwargs['comp'])
        if request.user.is_authenticated and (
                request.user.is_admin or request.user.id == int(kwargs['pk'])
                or competition.created_by == request.user.id):
            try:
                user = User.objects.get(pk=kwargs['pk'])

                badge_path = getBadge(user, competition)
                # guess_type() returns a (type, encoding) tuple; only the
                # type string is a valid Content-Type value. Fall back to a
                # generic binary type when nothing could be guessed.
                file_mimetype = (mimetypes.guess_type(badge_path)[0]
                                 or 'application/octet-stream')
                file_wrapper = FileWrapper(open(badge_path, 'rb'))
                response = HttpResponse(file_wrapper,
                                        content_type=file_mimetype)
                response['X-Sendfile'] = badge_path
                response['Content-Length'] = os.stat(badge_path).st_size
                response[
                    'Content-Disposition'] = 'attachment; filename={}'.format(
                        '{}\'s_badge.png'.format(
                            cyrtranslit.to_latin(
                                remove_ukrainian(user.get_full_name()), 'ru')))
                return response
            except Exception:
                # Deliberate best effort: any failure while building the
                # response is reported to the client as "not found" below.
                pass
        raise Http404
Exemplo n.º 16
0
    def test_special_diacritic_characters(self):
        ''' Diacritic characters must come back unchanged when the 'tj'
        language code is used.
        '''
        actual = cyrtranslit.to_latin(diacritic_chars, lang_code='tj')

        self.assertEqual(actual, diacritic_chars)
Exemplo n.º 17
0
def save_key_frames(video):
    """
    Saves every key frame reported by get_key_frames as a JPEG image and
    returns the list of the image file names.

    The file name prefix is the second "_"-separated piece of the video name
    (everything before the first "."), transliterated to Latin; the images
    are named "<prefix>_frame<N>.jpg" with N counting from 1.

    :param video: path of the video file.
    :return: list of the written image file names, in key frame order.
    :raises IndexError: if the video name contains no "_".
    """
    outfile = video.split(".")[0]
    outfile = outfile.split("_")[1]
    outfile = cyrtranslit.to_latin(outfile, 'ru')
    all_frames, offset = get_key_frames(video)

    # Every key frame is included, shifted by the reported offset.
    frames = [el + offset for el in all_frames]
    print(frames)

    filenames = []
    # Open the video once and always release it, instead of leaking one
    # capture per frame.
    v = cv2.VideoCapture(video)
    try:
        for count, f in enumerate(frames, start=1):
            # Seek so that the next read returns frame f - 1.
            v.set(cv2.CAP_PROP_POS_FRAMES, f - 1)
            _ok, frame = v.read()
            filename = outfile + "_frame" + str(count) + ".jpg"
            cv2.imwrite(filename, frame)
            filenames.append(filename)
    finally:
        v.release()
    return filenames
Exemplo n.º 18
0
def to_eng(line):
    """Transliterate ``line`` to Latin and drop punctuation and whitespace.

    The characters ``! . : ' ? - ,`` as well as spaces and newlines are
    removed from the transliterated text.
    """
    unwanted = str.maketrans('', '', "!.:'? -,\n")
    return cyrtranslit.to_latin(line, 'ru').translate(unwanted)
Exemplo n.º 19
0
def townships(data, reg):
    """Collect the townships mentioned in the raw entries.

    Township ids are assigned from 1 upwards following the sorted order of
    the distinct township names.

    :param data: mapping whose values are iterables of entries indexed with
        the ``constants.*_COL`` keys.
    :param reg: value handed to ``_region_for_name`` together with the
        entry's region name to resolve the region.
    :return: dict mapping each ``TownshipId`` to its ``Township``.
    :raises ValueError: if one township name appears under two regions.
    """
    names = {
        entry[constants.TOWNSHIP_COL]
        for content in data.values() for entry in content
    }
    name_to_id = {
        name: TownshipId(position)
        for position, name in enumerate(sorted(names), start=1)
    }

    found = {}
    for content in data.values():
        for entry in content:
            township_name = entry[constants.TOWNSHIP_COL]
            township_id = name_to_id[township_name]
            region = _region_for_name(reg, entry[constants.REGION_COL])

            if township_id not in found:
                found[township_id] = Township(
                    id=township_id,
                    name=township_name,
                    name_lt=to_latin(township_name),
                    region_id=region.id)
                continue

            # Already known: it must belong to the same region as before.
            known = found[township_id]
            if known.region_id != region.id:
                raise ValueError(
                    "Found same township for different regions: %s -> (%d, %d)"
                    % (township_name, known.region_id, region.id))
    print("Found %d townships" % len(found))
    return found
Exemplo n.º 20
0
def post_process(word, language):
    """
    Apply the post-processing steps configured for ``language`` to ``word``.

    Depending on the language's configuration the steps are, in this order:
    cyrtranslit transliteration, reversed ``translit`` transliteration,
    accent removal with unidecode, stopword removal, removal of "al" from
    "-"-separated parts, and whitespace elimination.

    :param word: text to process.
    :param language: language code used for the configuration look-ups.
    :return: the processed text.
    """
    if language in config.get_languages_by_property("transliterate", "cyr"):
        word = cyrtranslit.to_latin(word, language).casefold()

    if language in config.get_languages_by_property("transliterate", "trans"):
        word = translit(word, language, reversed=True).casefold()

    if language in config.get_languages_if_property_exists("accents"):
        word = unidecode.unidecode(word).casefold()

    if language in config.get_languages_if_property_exists("stopwords"):
        kept = [
            token for token in word.casefold().split()
            if token not in config.get_language(language)["stopwords"]
        ]
        word = " ".join(kept)

    if language in config.get_languages_if_property_exists("stopwords-arabic"):
        kept = [part for part in word.casefold().split("-") if part != "al"]
        word = " ".join(kept)

    if language in config.get_languages_if_property_exists(
            "space-elimination"):
        word = "".join(word.split())

    return word
    def test_mix_characters(self):
        ''' In mixed input only the Serbian cyrillic characters are expected
        to be transliterated; other cyrillic characters stay as they are.
        '''
        actual = cyrtranslit.to_latin(mix_characters_some_cyrillic)

        self.assertEqual(actual, mix_characters_all_latin)
    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole cyrillic alphabet must transliterate to the expected
        latin alphabet when the 'mk' language code is used.
        '''
        actual = cyrtranslit.to_latin(macedonian_alphabet_cyrillic,
                                      lang_code='mk')

        self.assertEqual(actual, macedonian_alphabet_latin)
Exemplo n.º 23
0
def read_file(fileName, rangeEnd, columnName, columnYear, collabGraph, sumOfWorks, professorsDict):
    """Accumulate paper and collaboration counts from one workbook.

    Rows 2 to ``rangeEnd - 1`` of the active sheet are read. Rows whose year
    is missing, not a number or outside 2000-2016 are skipped. The authors
    cell is transliterated to Latin, stripped of ``{``, ``}`` and ``"`` and
    split on commas; only authors present in ``professorsDict`` count.

    :param fileName: path of the workbook.
    :param rangeEnd: first row number that is no longer read.
    :param columnName: column letter of the authors cell.
    :param columnYear: column letter of the year cell.
    :param collabGraph: updated in place; ``collabGraph[m][n]`` is increased
        by one for every pair of known authors of a paper.
    :param sumOfWorks: updated in place; increased by one per known author.
    :param professorsDict: maps an author name to a dict with an ``'id'``.
    """
    workbook = openpyxl.load_workbook(os.path.join(fileName))
    sheet = workbook.active
    print(fileName + ' file reading...')
    for row_number in range(2, rangeEnd):
        row_label = str(row_number)
        authors = cyrtranslit.to_latin(sheet[columnName + row_label].value)
        try:
            year = int(sheet[columnYear + row_label].value)
        except (TypeError, ValueError):
            year = 0
        if not 2000 <= year <= 2016:
            continue

        for char in '{}"':
            authors = authors.replace(char, '')
        authors = authors.split(',')
        if 'Sanja Delčev' in authors:
            print(authors)

        # Keep only the authors that are known professors.
        known = [author for author in authors if author in professorsDict]

        for position, author in enumerate(known):
            m = professorsDict[author]['id']
            sumOfWorks[m] += 1
            # Pair the author with every later author of the same paper.
            for coauthor in known[position + 1:]:
                n = professorsDict[coauthor]['id']
                collabGraph[m][n] = collabGraph[m][n] + 1
Exemplo n.º 24
0
    def test_numerical_characters(self):
        ''' Numerical characters must come back unchanged when the 'tj'
        language code is used.
        '''
        actual = cyrtranslit.to_latin(numerical_chars, lang_code='tj')

        self.assertEqual(actual, numerical_chars)
Exemplo n.º 25
0
def delete_badge(user, competition):
    """Delete the badge image of ``user`` for ``competition``, best effort.

    The file removed is
    ``<BASE_DIR>/media/badges/<competition id>/<latin full name>_badge.png``.
    Failures (for example a badge that does not exist) are ignored.
    """
    try:
        os.remove(settings.BASE_DIR + '/media/badges/{}/{}_badge.png'.format(
            competition.id,
            cyrtranslit.to_latin(remove_ukrainian(user.get_full_name()), 'ru'))
                  )
    except Exception:
        # Still best effort, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` does.
        pass
Exemplo n.º 26
0
    def test_alphabet_transliteration(self):
        ''' The whole Serbian cyrillic alphabet must transliterate to the
        expected latin alphabet.
        '''
        actual = cyrtranslit.to_latin(serbian_alphabet_cyrillic)

        self.assertEqual(actual, serbian_alphabet_latin)
def clean_lineinst(line):
    """Parse a ``<digits>,<uname>,<fname>`` record and normalise the name.

    The name is stripped of every character that is not a Latin or Cyrillic
    letter, a digit or whitespace, then trimmed, lower-cased, transliterated
    to Latin and cleared of apostrophes.

    :param line: the raw record.
    :return: tuple ``(uid, uname, fname)``; ``uid`` and ``uname`` are
        returned exactly as they appear in the record.
    :raises AttributeError: if ``line`` does not match the record pattern.
    """
    # Raw strings: "\d" and "\s" in a plain literal are invalid escape
    # sequences (the compiled patterns are unchanged).
    pat = re.compile(r"(\d+),(.*),(.*)")
    # NOTE(review): the а-я / А-Я ranges do not include ё / Ё, so those
    # letters are removed from the name — confirm this is intended.
    pat_word = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')

    uid, uname, fname = pat.match(line).groups()
    fname = pat_word.sub('', fname).strip().lower()

    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uid, uname, fname)
Exemplo n.º 28
0
def get_translated_title(language_abbreviation, title):
    """Return the title together with its translated variants.

    The result always holds the original title followed by its translation.
    For ``"en"`` the Latin transliteration of the translation is added as a
    third element, unless it equals the original title.

    :param language_abbreviation: language code passed to ``translate``.
    :param title: the title to translate.
    :return: list of two or three title strings.
    """
    translation = translate(language_abbreviation, title)["res"]
    titles = [title, translation]
    if language_abbreviation == "en":
        latin_title = cyrtranslit.to_latin(translation)
        if latin_title != title:
            titles.append(latin_title)
    return titles
Exemplo n.º 29
0
    def data_importer_of_municipality_sombor(self):
        """Re-import the "rashodi" documents for the municipality of Sombor.

        Removes the municipality's existing "rashodi" documents from
        ``db.opstine`` and then inserts one document for every qualifying row
        of ``data/rashodi/sombor.csv``, tagged with the program and
        sub-program headings most recently seen in the file.
        """
        # Remove previous records in database, if there are any for this municipality
        db.opstine.remove({"opstina.latinica": "Sombor", "tipPodataka.slug": "rashodi"})

        program = ''
        subprogram = ''
        # Use program categories for better data categorizing
        program_categories = utils.sombor_programs()

        # The context manager closes the CSV file even when a row fails to
        # import.
        with open("data/rashodi/sombor.csv", "r") as csv_file:
            for index, row in enumerate(reader(csv_file, delimiter=",")):
                if index <= 4:
                    # The first five rows are never imported.
                    continue

                # A non-blank third column may start a new program and/or a
                # new sub-program of the current program.
                if row[2] not in ["", " "]:
                    if row[2].strip() in program_categories:
                        program = row[2].strip()

                    if program != "" and row[2].strip() in program_categories[program]:
                        subprogram = row[2].strip()

                if row[1] not in ["", " "] and program not in ["", " "] and subprogram not in ["", " "] and len(row[1]) < 4:
                    json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                        "Сомбор",
                        row[1],
                        row[2].replace("*", ""),
                        row[3],
                        row[4],
                        row[5],
                        row[6],
                        None
                    )

                    # Add program and subprogram after building the main mongo document
                    json_doc["program"] = {}
                    json_doc["program"]["cirilica"] = program.strip()
                    json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                    json_doc["potProgram"] = {}
                    json_doc["potProgram"]["cirilica"] = subprogram.strip()
                    json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")
                    db.opstine.insert(json_doc)
                    # Single parenthesised argument: prints identically under
                    # Python 2 and Python 3.
                    print("Opstine: %s - Program: %s %s" % ("Сомбор", program, row[1]))
Exemplo n.º 30
0
 def c2l():
     """Show the Latin transliteration of the value held by ``Frame.m``.

     The result is displayed in a new window, printed and copied to the
     clipboard.
     """
     source_text = Frame.m.get()
     window = tk.Tk()
     window.title('Result(Cyrillic2Latin)')
     result = cyrtranslit.to_latin(source_text, 'ru')
     print(result)
     pyperclip.copy(result)
     tk.Label(window, text=result, font=16).pack(fill="x")
Exemplo n.º 31
0
def get_translated_keywords(language_abbreviation, keywords):
    """
    Expand every keyword into the keyword, its translation and a Latin
    transliteration.

    For ``'en'`` the transliteration is taken from the translation; for any
    other language it is taken from the keyword itself.

    :param language_abbreviation: eg. 'en', 'sr'.
    :param keywords: comma-separated keywords, eg. "epidemic,flu".
    :return: flat list with three entries per keyword, in input order:
        keyword, translation, transliteration.
    """
    keywords_data = []
    for keyword in keywords.split(","):
        translation = translate(language_abbreviation, keyword)["res"]
        if language_abbreviation == "en":
            to_transliterate = translation
        else:
            to_transliterate = keyword
        keywords_data.extend(
            [keyword, translation, cyrtranslit.to_latin(to_transliterate)])
    return keywords_data
Exemplo n.º 32
0
def translate_word(word, lang, src='en'):
    """Translate ``word`` into ``lang`` using the module-level translator.

    :param word: text to translate.
    :param lang: code of the target language.
    :param src: code of the language ``word`` is written in; ``'auto'``
        calls the translator without a source language.
    :return: the translated text; for ``lang == 'sr'`` a tuple of the
        translated text and its Latin transliteration.
    """
    if src == 'auto':
        translated_word = translator.translate(word, dest=lang).text
    else:
        # Honour the caller's source language: the call used to hard-code
        # 'en' and ignore `src`.
        translated_word = translator.translate(word, dest=lang, src=src).text

    if lang == 'sr':
        # Serbian results also get a Latin version of the word.
        return translated_word, cyrtranslit.to_latin(translated_word)
    return translated_word
def clean_linevk(line):
    """Parse a ``<digits>,<uname>,<name1>,<name2>`` record into a full name.

    Both name parts are stripped of every character that is not a Latin or
    Cyrillic letter, a digit or whitespace, then trimmed and lower-cased.
    They are joined with a space, transliterated to Latin and cleared of
    apostrophes.

    :param line: the raw record.
    :return: tuple ``(uid, uname, fname)``; ``uid`` and ``uname`` are
        returned exactly as they appear in the record.
    :raises ValueError: if ``line`` does not match the record pattern. The
        offending line is printed first.
    """
    # Raw strings: "\d" and "\s" in a plain literal are invalid escape
    # sequences (the compiled patterns are unchanged).
    pat = re.compile(r"(\d+),(.*),(.*),(.*)")
    # NOTE(review): the а-я / А-Я ranges do not include ё / Ё, so those
    # letters are removed from the names — confirm this is intended.
    pat_word = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')

    match = pat.match(line)
    if match is None:
        print(line)
        # Fail with a clear error: execution used to continue with unbound
        # variables and die with an UnboundLocalError.
        raise ValueError("Malformed record: %r" % (line,))

    uid, uname, name1, name2 = match.groups()
    name1 = pat_word.sub('', name1).strip().lower()
    name2 = pat_word.sub('', name2).strip().lower()
    fname = name1 + ' ' + name2
    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uid, uname, fname)
Exemplo n.º 34
0
def create_file_name(fio_ru):
    """Build a file name such as ``Word_X`` or ``Word_X_Y`` from a full name.

    The name is transliterated to Latin, with ``J``/``j`` replaced by
    ``Y``/``y`` and apostrophes removed. The result is the capitalized first
    word, an underscore and the first letter of the second word; when the
    name has exactly three words, an underscore and the first letter of the
    third word are appended.
    """
    latin_name = cy.to_latin(fio_ru, 'ru').replace('J', 'Y').replace("'", "").replace('j', 'y')
    words = latin_name.split()
    second_initial = words[1][0]
    third_suffix = '_' + words[2][0] if len(words) == 3 else ''
    return str(words[0]).capitalize() + '_' + second_initial + third_suffix
Exemplo n.º 35
0
def translite(file_in, file_out, ru=''):
    """Transliterate *file_in* line by line and write the result to *file_out*.

    :param file_in: path of the text file to read.
    :param file_out: path of the file to (over)write.
    :param ru: when truthy, each line is converted with
        ``cyrtranslit.to_latin(line, 'ru')``; otherwise each line is converted
        with ``cyrtranslit.to_cyrillic(line)``.
    """
    # Both handles are managed by ``with`` so the output file is closed even
    # if reading or transliteration raises.  The output is opened first, so
    # it is created/truncated before the input is opened.
    with open(file_out, "w") as text_file, open(file_in) as f:
        if ru:
            for line in f:
                text_file.write(cyrtranslit.to_latin(line, 'ru'))
        else:
            for line in f:
                text_file.write(cyrtranslit.to_cyrillic(line))
Exemplo n.º 36
0
    def import_data_parliament_2007(self):
        """Import the 2007 parliamentary election results CSV into MongoDB.

        The first CSV row is the header: columns 12 onwards hold the names of
        the electoral lists.  Every later row yields one document per
        electoral list, combining the row's territory and ballot figures with
        that list's vote count and its share of the row's total votes.
        Documents are inserted into ``db[collection]`` in batches of 1000.
        """
        election_type = 'parlamentarni'
        year = 2007
        first_result_col = 12
        self.prep_import(election_type, year, None, None)
        file_path = self.get_data_file_path(election_type, year, None, None)

        docs = []
        candidates_or_parties = {}
        parent_territory = ''

        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row_count, row in enumerate(tqdm(reader)):
                # Header row: remember the electoral list name of each column.
                if row_count == 0:
                    for i in range(first_result_col, len(row)):
                        candidates_or_parties[i] = row[i].replace('\n', '')
                    continue

                territory = row[2].strip()
                territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                # Rows without a polling station number keep the empty string
                # (int('') would raise) so the instance checks below can
                # tell them apart from polling-station rows.
                polling_station_raw = row[3].strip()
                polling_station_num = int(polling_station_raw) if polling_station_raw != '' else ''
                polling_station_address = row[4].strip()
                ballots_received_count = int(row[5].strip())
                unused_ballots_count = int(row[6].strip())
                number_of_voters_registered = int(row[7].strip())
                voters_who_voted_count = int(row[8].strip())
                ballots_in_ballot_box_count = int(row[9].strip())
                invalid_ballots_count = int(row[10].strip())
                valid_ballots_count = int(row[11].strip())

                doc = {
                    'brojPrimljeniGlasackiListica': ballots_received_count,
                    'brojNeupotrebljenihGlasackiListica': unused_ballots_count,
                    'brojUpisanihBiracaUBirackiSpisak': number_of_voters_registered,
                    'nevazeciGlasackiListici': invalid_ballots_count,
                    'biraciKojiSuGlasali': {'broj': voters_who_voted_count},
                    'brojGlasackihListicaUKutiji': {'broj': ballots_in_ballot_box_count},
                    'vazeciGlasackiListici': {'broj': valid_ballots_count},
                    'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
                    'godina': int(year),
                    'teritorija': territory,
                    'teritorijaSlug': territory_slug,
                }

                # Some rows consist of territory grouping.
                # We need to track those.
                if cyrtranslit.to_latin(territory, 'sr').isupper():
                    doc['instanca'] = 1

                elif ('okrug' in territory_slug
                      or territory_slug in ['grad-beograd', 'inostranstvo']
                      or (territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija'
                          and polling_station_num == '')):
                    doc['instanca'] = 2
                    parent_territory = territory

                elif polling_station_num == '':
                    doc['instanca'] = 3
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'),
                                                          to_lower=True)

                else:
                    doc['instanca'] = 4
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'),
                                                          to_lower=True)
                    doc['brojBirackogMesta'] = polling_station_num
                    doc['adresaBirackogMesta'] = polling_station_address

                # Shares are relative to the sum of all list votes in the row,
                # so the total has to be known before the first document.
                votes = [int(row[j]) for j in range(first_result_col, len(row))]
                total_votes = sum(votes)

                for j, vote_count in enumerate(votes, first_result_col):
                    if total_votes:
                        udeo = (float(vote_count) / total_votes) * 100
                    else:
                        udeo = 0.0
                    doc['rezultat'] = {'glasova': vote_count, 'udeo': udeo}

                    doc['izbornaLista'] = candidates_or_parties[j]
                    doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[j], 'sr'),
                                                      to_lower=True)

                    docs.append(doc.copy())

                    # Flush to the database in batches of 1000 documents.
                    if len(docs) % 1000 == 0:
                        db[collection].insert(docs)
                        docs = []

        # Insert remaining documents
        if len(docs) > 0:
            db[collection].insert(docs)
Exemplo n.º 37
0
    def import_data_rest(self, election_type, year, month=None, rnd=None):
        """Import one election results CSV into MongoDB.

        The CSV file is located through ``self.get_data_file_path``.  Row 0 is
        the header from which candidate/party names are read, row 1 is
        skipped, and every later row yields one document per result column.
        Column offsets and the stored fields depend on the ``year`` /
        ``election_type`` combination.  Documents are inserted into
        ``db[collection]`` in batches of 1000.

        :param election_type: compared against ``"predsjednicki"`` and
            ``"parlamentarni"`` to select the column layout.
        :param year: election year; converted with ``int()`` wherever used.
        :param month: ``.title()`` is called on it in the presidential
            branches, so it must be a string there.
        :param rnd: ``.title()`` is called on it in the presidential
            branches, so it must be a string there.
        """

        self.prep_import(election_type, year, month, rnd)

        file_path = self.get_data_file_path(election_type, year, month, rnd)

        row_count = 0
        docs = []
        # Maps str(column index) -> candidate or party name from the header.
        candidates_or_parties = {}
        parent_territory = ''

        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row in tqdm(reader):
                doc = {}

                # Get all the candidates/parties
                if row_count == 0:
                    if int(year) == 2004 and election_type == "predsjednicki":
                        for i in xrange(11, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                    if int(year) == 2008 and election_type == "predsjednicki":
                        for i in xrange(8, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                    # NOTE(review): this is a fresh ``if`` chain, so for the
                    # 2004 and 2008 presidential files the final ``else``
                    # below runs as well (re-reading columns 13, 15, ...).
                    if int(year) == 2003 and election_type in ["predsjednicki", "parlamentarni"]:
                        for i in xrange(6, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                    elif int(year) == 2002 and election_type == "predsjednicki":
                        for i in xrange(7, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                    else:
                        # Default layout: names are read from every second
                        # column starting at index 13.
                        for i in xrange(13, len(row), 2):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                elif row_count == 1:
                    # The second row is ignored.
                    pass

                else:

                    # Territory / polling station columns.
                    if int(year)==2004 and election_type=="predsjednicki":
                        territory = row[1].strip()
                        territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                        polling_station_num = int(row[2].strip())
                        polling_station_address = row[3].strip()
                        ballots_received_count = int(row[4].strip())
                        unused_ballots_count = int(row[5].strip())
                        registered_voters_count = int(row[6].strip())
                        voters_who_voted_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        valid_ballots_count = int(row[10].strip())
                        print row_count

                    else:
                        print row_count
                        territory = row[0].strip()
                        territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                        # An empty polling station cell is kept as '' instead
                        # of being converted to int.
                        polling_station_num = int(row[1].strip()) if row[1].strip() is not '' else row[1].strip()
                        polling_station_address = row[2].strip()

                        registered_voters_count = int(row[3].strip())

                    # Ballot figures; column positions differ per year/type.
                    if int(year) == 2012 and election_type == "predsjednicki":
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        voters_who_voted_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())

                    if int(year) == 2012 and election_type == "parlamentarni":
                        voters_who_voted_count = int(row[4].strip())
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count=int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())


                    if int(year)==2008 and election_type=="predsjednicki":
                        voters_who_voted_count = int(row[6].strip())
                        voters_who_voted_percent=float(row[7].strip())
                    if int(year) not in [2008, 2012]  and election_type != "predsjednicki":
                        voters_who_voted_count = int(row[4].strip())

                    if int(year) == 2003 and election_type in["predsjednicki","parlamentarni"]:
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())



                    if int(year) == 2002 and election_type == "predsjednicki":
                        print row_count
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())


                    # NOTE(review): this branch only runs when election_type
                    # is neither "predsjednicki" nor "parlamentarni" -- confirm
                    # that this is the intended condition.
                    if int(year) not in [2002, 2003,2004] and election_type not in ["predsjednicki", "parlamentarni"]:
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())


                    # Fields stored for every data row.
                    doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
                    doc['biraciKojiSuGlasali'] = {}

                    doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count

                    if int(year) in [2002, 2003] and election_type in ["predsjednicki", "parlamentarni"]:
                        doc['odzivBiraca']=total_voter_turn_out

                    if int(year) not in [2002, 2003] and election_type not in ["predsjednicki", "parlamentarni"]:
                        doc['biraciKojiSuGlasali']['udeo'] = voters_who_voted_percent
                        doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                        doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count
                        if int(year) not in [2012, 2004] and election_type!="predsjednicki":
                            doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count
                        # NOTE(review): the assignment below replaces whatever
                        # was stored under this key just above, and the dict
                        # is then filled with the *invalid* ballot figures --
                        # confirm the intended key.
                        doc['brojGlasackihListicaUKutiji'] = {}
                        doc['brojGlasackihListicaUKutiji']['broj'] = invalid_ballots_count
                        if int(year)!=2004 and election_type!="predsjednicki":
                            doc['brojGlasackihListicaUKutiji']['udeo'] = invalid_ballots_percent
                        doc['vazeciGlasackiListici'] = {}
                        doc['vazeciGlasackiListici']['broj'] = valid_ballots_count
                        if int(year) != 2004 and election_type != "predsjednicki":
                            doc['vazeciGlasackiListici']['udeo'] = valid_ballots_percent
                    # Some rows consist of territory grouping.
                    # We need to track those.
                    # NOTE(review): the ``is ''`` / ``is not ''`` tests below
                    # compare identity, not equality, with a string literal.
                    if cyrtranslit.to_latin(territory, 'sr').isupper():
                        doc['instanca'] = 1

                    elif 'okrug' in territory_slug\
                            or territory_slug in ['grad-beograd', 'inostranstvo']\
                            or territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija' and polling_station_num is '':
                        doc['instanca'] = 2
                        # Remembered as the parent of the rows that follow.
                        parent_territory = territory

                    elif polling_station_num is '':
                        doc['instanca'] = 3
                        doc['parentTeritorija'] = parent_territory
                        doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                    elif polling_station_num is not '':
                        doc['instanca'] = 4
                        doc['parentTeritorija'] = parent_territory
                        doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)
                        doc['brojBirackogMesta'] = polling_station_num
                        doc['adresaBirackogMesta'] = polling_station_address

                    # One document per result column.  In every branch below
                    # ``total_votes`` is a running sum of the columns visited
                    # so far in this row.
                    # NOTE(review): 'udeo' is therefore the share of the
                    # running sum, not of the row total -- confirm intent.
                    if int(year)==2003 and election_type in ["parlamentarni"]:
                        total_votes=0
                        udeo=0
                        for j in xrange(6, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}
                            doc['rezultat']['glasova'] = int(row[j])


                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0

                            doc['rezultat']['udeo'] =float(udeo)


                            doc['izbornaLista'] = candidates_or_parties[str(j)]
                            doc['izbornaListaSlug'] = slugify(
                            cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            # Shallow copy: the next iteration reuses ``doc``.
                            docs.append(doc.copy())

                            # Flush to the database in batches of 1000.
                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []
                    elif int(year) == 2002 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(7, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}


                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                print int(row[j])
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                              to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []
                    elif int(year) == 2003 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(6, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}


                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                print int(row[j])
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                              to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                    elif int(year) == 2004 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(11, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}

                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100
                                print udeo
                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                          to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                    else:
                        # Default layout: vote counts are read from every
                        # second column starting at index 13.
                        total_votes=0
                        udeo=0
                        for j in xrange(13, len(row), 2):
                            # Set generic values
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}
                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100
                                print udeo
                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election
                            if election_type == 'predsjednicki':
                                month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                                rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                                doc['mesec'] = month_cyr
                                doc['krug'] = rnd_cyr
                                doc['kandidat'] = candidates_or_parties[str(j)].title()
                                doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            else:
                                doc['izbornaLista'] = candidates_or_parties[str(j)]
                                doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                row_count += 1

        # Insert remaining documents
        if len(docs) > 0:
            db[collection].insert(docs)
    def test_latin_alphabet_characters(self):
        ''' Latin alphabet characters must come back from to_latin unchanged.
        '''
        actual = cyrtranslit.to_latin(alphabet_chars)
        expected = alphabet_chars
        self.assertEqual(actual, expected)
    def test_numerical_characters(self):
        ''' Digits must come back from to_latin unchanged.
        '''
        actual = cyrtranslit.to_latin(numerical_chars)
        expected = numerical_chars
        self.assertEqual(actual, expected)
    def test_special_diacritic_characters(self):
        ''' Characters with diacritics must come back from to_latin unchanged.
        '''
        actual = cyrtranslit.to_latin(diacritic_chars)
        expected = diacritic_chars
        self.assertEqual(actual, expected)
    def test_special_characters(self):
        ''' Special characters must come back from to_latin unchanged.
        '''
        actual = cyrtranslit.to_latin(special_chars)
        expected = special_chars
        self.assertEqual(actual, expected)
    def test_alphabet_transliteration(self):
        ''' The whole Serbian cyrillic alphabet maps onto its latin form.
        '''
        actual = cyrtranslit.to_latin(serbian_alphabet_cyrillic)
        expected = serbian_alphabet_latin
        self.assertEqual(actual, expected)
    def test_alphabet_transliteration_cyrillic_to_latin(self):
        ''' The whole Russian cyrillic alphabet maps onto its latin form
        when to_latin is called with lang_code='ru'.
        '''
        actual = cyrtranslit.to_latin(russian_alphabet_cyrillic, lang_code='ru')
        expected = russian_alphabet_latin
        self.assertEqual(actual, expected)
Exemplo n.º 44
0
    def import_data_parliament_2016(self):
        """Import the 2016 parliamentary election results CSV into MongoDB.

        The first CSV row is the header: columns 14 onwards hold the names of
        the electoral lists.  Every later row whose registered-voters cell
        (column 7) is not empty yields one document per electoral list,
        combining the polling station data with that list's vote count and
        its share of the row's total votes.  Documents are inserted into
        ``db[collection]`` in batches of 1000.
        """
        election_type = 'parlamentarni'
        year = 2016
        first_result_col = 14
        self.prep_import(election_type, year, None, None)
        file_path = self.get_data_file_path(election_type, year, None, None)
        docs = []
        candidates_or_parties = {}
        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row_count, row in enumerate(tqdm(reader)):
                # Header row: remember the electoral list name of each column.
                if row_count == 0:
                    for i in range(first_result_col, len(row)):
                        candidates_or_parties[i] = row[i].replace('\n', '')
                    continue

                # FIXME: we do this because row 8,350 is blank.
                # Compare by value (not identity) and tolerate short rows.
                if len(row) <= 7 or row[7].strip() == '':
                    continue

                parent_territory = row[1].strip()
                parent_territory_slug = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                territory = row[3].strip()
                territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)

                polling_station_num = int(row[4].strip())
                polling_station_address = row[5].strip()
                coordinates = row[6].strip().split(',')

                registered_voters_count = int(row[7].strip())
                ballots_received_count = int(row[8].strip())
                unused_ballots_count = int(row[9].strip())
                voters_who_voted_count = int(row[10].strip())
                ballots_in_ballot_box_count = int(row[11].strip())
                invalid_ballots_count = int(row[12].strip())
                valid_ballots_count = int(row[13].strip())

                doc = {}

                # Set election type and year
                doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                doc['godina'] = int(year)

                # Set generic location values
                doc['teritorija'] = territory
                doc['teritorijaSlug'] = territory_slug

                doc['parentTeritorija'] = parent_territory
                doc['parentTeritorijaSlug'] = parent_territory_slug

                doc['brojBirackogMesta'] = polling_station_num
                doc['adresaBirackogMesta'] = polling_station_address

                # FIXME: at least one coordinate is missing (row 1481)
                if len(coordinates) == 2:
                    doc['koordinateBirackomMestu'] = {
                        'latituda': float(coordinates[0].strip()),
                        'longituda': float(coordinates[1].strip()),
                    }

                # Set generic ballot values
                doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
                doc['biraciKojiSuGlasali'] = {'broj': voters_who_voted_count}

                doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count

                # NOTE(review): the ballots-in-box count stored here is
                # replaced right away by a dict holding the *invalid* ballot
                # count, so it never reaches the database.  The stored
                # document shape is kept as is -- confirm the intended key
                # for invalid ballots before changing it.
                doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count
                doc['brojGlasackihListicaUKutiji'] = {'broj': invalid_ballots_count}

                doc['vazeciGlasackiListici'] = {'broj': valid_ballots_count}

                # For this year, we don't have grouped territories we are importing.
                # So every document is at the smallest unit of territory
                doc['instanca'] = 4

                # Shares are relative to the sum of all list votes in the row,
                # so the total has to be known before the first document.
                votes = [int(row[j]) for j in range(first_result_col, len(row))]
                total_votes = sum(votes)

                for j, vote_count in enumerate(votes, first_result_col):
                    if total_votes:
                        udeo = (float(vote_count) / total_votes) * 100
                    else:
                        udeo = 0.0
                    doc['rezultat'] = {'glasova': vote_count, 'udeo': udeo}

                    doc['izbornaLista'] = candidates_or_parties[j]
                    doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[j], 'sr'),
                                                      to_lower=True)

                    docs.append(doc.copy())

                    # Flush to the database in batches of 1000 documents.
                    if len(docs) % 1000 == 0:
                        db[collection].insert(docs)
                        docs = []

        # Insert remaining documents
        if len(docs) > 0:
            db[collection].insert(docs)
    def build_mongo_document_structure_for_budzets(self, razdeo, glava, program, funkcija, programska_aktivnost_projekat, ekonomska_klasifikacija, opis, ukupna_sredstva):
        """
        Assemble the nested budget document for one economic-classification row.

        Each hierarchy level is stored as ``{"broj": ..., "opis": {...}}`` where
        ``opis`` holds the description in Cyrillic together with its Latin
        transliteration. The same ``opis`` text is written to every level.

        :param razdeo: value stored under ``razdeo.broj``
        :param glava: value stored under ``glava.broj``
        :param program: value stored under ``program.broj``
        :param funkcija: value stored under ``funkcija.broj``
        :param programska_aktivnost_projekat: value stored under
            ``programskaAktivnostProjekat.broj``
        :param ekonomska_klasifikacija: value stored under
            ``ekonomskaKlasifikacija.broj``
        :param opis: description text (Cyrillic) shared by all levels
        :param ukupna_sredstva: the total for economic classification, given as
            text; commas are stripped before it is converted to a float
        :return: the assembled document (dict)
        """
        opis_latinica = cyrtranslit.to_latin(opis, "sr")

        def build_level(broj):
            # Each level gets its own dict objects (nothing is shared between
            # levels), exactly like a hand-written nested literal would.
            return {
                "broj": broj,
                "opis": {
                    "cirilica": opis,
                    "latinica": opis_latinica
                }
            }

        json_doc = {}
        json_doc["razdeo"] = build_level(razdeo)
        json_doc["glava"] = build_level(glava)
        json_doc["program"] = build_level(program)
        json_doc["funkcija"] = build_level(funkcija)
        json_doc["programskaAktivnostProjekat"] = build_level(programska_aktivnost_projekat)

        # Only the economic classification level carries the amount.
        classification = build_level(ekonomska_klasifikacija)
        amount_text = ukupna_sredstva.replace(",", "")
        classification["ukupna_sredstva"] = self.convert_to_float(amount_text)
        json_doc["ekonomskaKlasifikacija"] = classification

        return json_doc
Exemplo n.º 46
0
    def data_importer_of_municipality_vranje(self):
        """
        Re-import the Vranje rows of ``data/rashodi/vranje.csv`` into ``db.opstine``.

        Documents matching ``opstina.latinica == "Vranje"`` and
        ``tipPodataka.slug == "rashodi"`` are removed first. The CSV is then read
        row by row (row 0 is skipped):

        * rows 1-47 whose class number (column 1) is longer than two characters
          are stored together with a parent category, looked up by the first
          two characters of the class number (class "541" is filed under the
          parent key "51");
        * rows with an index above 48 are stored together with the current
          program / sub-program, both of which are tracked from column 2 while
          the rows are read.

        Row 48 itself is never imported.

        NOTE(review): the document builder used here stamps every document with
        ``tipPodataka.slug == "prihodi"`` while the clean-up filter above targets
        "rashodi" -- confirm the clean-up really matches what gets inserted.

        :return: None
        """
        municipality = "Врање"
        blank_cells = ["", " "]

        db.opstine.remove({"opstina.latinica": "Vranje", "tipPodataka.slug": "rashodi"})
        # init parent categories JSON
        parent_categories = utils.parent_categories_for_vranje()
        program_categories = utils.program_categories_for_vranje()

        program = ""
        subprogram = ""
        # Read data from vranje csv file. The context manager makes sure the
        # file handle is closed even when importing a row raises.
        with open("data/rashodi/vranje.csv", "r") as csv_file:
            for index, row in enumerate(reader(csv_file, delimiter=",")):
                if index == 0:
                    continue

                if index < 48 and len(row[1]) > 2:
                    if row[1] != "541":
                        parent_handler = parent_categories[row[1][0:2]]
                    else:
                        parent_handler = parent_categories["51"]

                    # The builder accepts seven positional values followed by
                    # the optional parent description/number. Passing an extra
                    # positional placeholder here exceeded its arity and raised
                    # TypeError, so the parent info is bound by keyword instead.
                    # NOTE(review): columns 3-6 are bound the same way as in
                    # the program branch below -- confirm the CSV column order.
                    json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                        municipality,
                        row[1],
                        row[2],
                        row[3],
                        row[4],
                        row[5],
                        row[6],
                        kategorija_roditelj=parent_handler,
                        roditelj_broj=row[1][0:2]
                    )
                    db.opstine.insert(json_doc)
                    # Single-argument print(...) behaves the same as the print
                    # statement on Python 2 and also parses on Python 3.
                    print("Opstine: %s - Kategorija Roditelj: %s - Opis: %s" % (municipality, parent_handler, row[1]))

                elif index > 48:
                    # init program
                    if row[2] not in blank_cells:
                        label = row[2].strip()
                        if label in program_categories:
                            program = label

                        if program != "" and label in program_categories[program]:
                            subprogram = label

                    # Only rows with a class number are stored, and only once
                    # both a program and a sub-program have been seen.
                    if row[1] not in blank_cells and program not in blank_cells and subprogram not in blank_cells:
                        json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                            municipality,
                            row[1],
                            row[2],
                            row[3],
                            row[4],
                            row[5],
                            row[6],
                            None
                        )

                        json_doc["program"] = {}
                        json_doc["program"]["cirilica"] = program.strip()
                        json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                        json_doc["potProgram"] = {}
                        json_doc["potProgram"]["cirilica"] = subprogram.strip()
                        json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")
                        db.opstine.insert(json_doc)
                        print("Opstine: %s - Program: %s %s" % (municipality, program, row[1]))
    def build_mongo_document_structure_for_prihodi_rashodi(self, municipality, class_number, opis, prihodi_vudzeta, sopstveni_prihodi, ostali, ukupno,  kategorija_roditelj=None, roditelj_broj=None):
        """
        Build the Mongo document for one classification row of a municipality.

        For Сомбор, Звездара, Краљево and Нови Београд only the ``ukupno`` text
        is used (the three component amounts are stored as 0). For every other
        municipality the three component amounts are converted to floats and
        ``ukupno`` is their sum; the ``ukupno`` argument is ignored in that case.

        :param municipality: municipality name in Cyrillic
        :param class_number: classification number (text); stored stripped
        :param opis: classification description (text); stored stripped
        :param prihodi_vudzeta: amount text, commas are removed before conversion
        :param sopstveni_prihodi: amount text, commas are removed before conversion
        :param ostali: amount text, commas are removed before conversion
        :param ukupno: total amount text, only read for the four municipalities
            listed above
        :param kategorija_roditelj: optional parent category description; when
            given it replaces the default parent category
        :param roditelj_broj: parent category number stored alongside
            ``kategorija_roditelj``
        :return: the assembled document (dict)
        """
        # Some municipalities only provide the total column, each with its own
        # number formatting; normalise that text first.
        if municipality in ("Сомбор", "Звездара"):
            total_text = ukupno.replace(',', '')
        elif municipality == "Краљево":
            # Separators are removed and the last two characters dropped
            # (presumably the decimal digits -- confirm against the data).
            total_text = ukupno.replace(',', '').replace('.', '')[:-2]
        elif municipality == "Нови Београд":
            total_text = ukupno.replace('.', '')
        else:
            total_text = None

        if total_text is None:
            # All component columns are available: the total is calculated.
            prihodi_vudzeta = self.convert_to_float(prihodi_vudzeta.replace(',', ''))
            sopstveni_prihodi = self.convert_to_float(sopstveni_prihodi.replace(',', ''))
            ostali = self.convert_to_float(ostali.replace(',', ''))
            ukupno = prihodi_vudzeta + sopstveni_prihodi + ostali
        else:
            # Only the total is known: it is imported as-is, components are 0.
            prihodi_vudzeta = sopstveni_prihodi = ostali = 0
            ukupno = self.convert_to_float(total_text)

        opstina_doc = {
            "cirilica": municipality,
            "latinica": cyrtranslit.to_latin(municipality, "sr"),
            "slug": slugify(municipality, to_lower=True)
        }

        opis_cirilica = opis.strip()
        klasifikacija_doc = {
            "opis": {
                "cirilica": opis_cirilica,
                "latinica": cyrtranslit.to_latin(opis_cirilica, "sr")
            }
        }

        # Let's build mongo document structure
        json_doc = {
            "tipPodataka": {"vrednost": "Prihodi", "slug": "prihodi"},
            "godina": 2015,
            # Default parent category, used when the caller supplies none.
            "kategorijaRoditelj": {
                "opis": {
                    "cirilica": "Скупштина општине",
                    "latinica": "Skupština Opštine"
                },
                "broj": 0
            },
            "opstina": opstina_doc,
            "klasifikacija": klasifikacija_doc,
            "prihodiBudzeta": prihodi_vudzeta,
            "sopstveniPrihodi": sopstveni_prihodi,
            "ostali": ostali,
            "ukupno": ukupno
        }

        if kategorija_roditelj is not None:
            parent_doc = json_doc["kategorijaRoditelj"]
            parent_doc["opis"]["cirilica"] = kategorija_roditelj.strip()
            parent_doc["opis"]["latinica"] = cyrtranslit.to_latin(kategorija_roditelj, "sr")
            parent_doc["broj"] = roditelj_broj

        klasifikacija_doc["broj"] = class_number.strip()

        return json_doc
Exemplo n.º 48
0
    def import_data(self, election_type, year, month=None, rnd=None):
        """
        Import election results from an XML data file into ``db['izbori']``.

        The file contains ``Result`` elements; for each territory/candidate pair
        there is one element with the share of votes (its data type contains
        ``%``) and one with the number of votes. Both values are merged into a
        single document, which is queued for insertion once both are known.

        :param election_type: election type; ``'predsjednicki'`` selects the
            presidential layout (candidate attribute, month and round fields),
            anything else is handled as an election-list layout
        :param year: election year, stored as an int
        :param month: election month; it is title-cased for presidential
            elections, so it must be a string in that case
        :param rnd: election round; it is title-cased for presidential
            elections, so it must be a string in that case
        :return: None
        """
        self.prep_import(election_type, year, month, rnd)

        file_path = self.get_data_file_path(election_type, year, month, rnd)

        root = xml.etree.ElementTree.parse(file_path).getroot()

        is_presidential = election_type == 'predsjednicki'
        candidate_attribute = u'Кандидат' if is_presidential else u'Изборна_листа'

        results = {}
        docs = []
        for result in root.findall('Result'):
            territory = result.attrib[u'Територија'].strip()
            data_type = result.attrib[u'Врста_податка'].strip()
            candidate = result.attrib[candidate_attribute].strip()

            # We have two entries per territory. One for share of votes (in percentage) and one for number of votes.
            # We want to save both numbers in the same document
            # To achieve this, we keep track of created documents per territory
            territory_results = results.setdefault(territory, {})

            entry = territory_results.get(candidate)
            if entry is None:
                territory_latin = cyrtranslit.to_latin(territory.encode('utf-8'), 'sr')
                entry = {
                    'teritorija': territory,
                    'teritorijaSlug': slugify(territory_latin, to_lower=True),
                    'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
                    'godina': int(year),
                    'rezultat': {
                        'udeo': None,
                        'glasova': None
                    }
                }
                territory_results[candidate] = entry

                # All values with capital letters are grouped regions
                # we need to mark them so that we don't count votes more than once
                territory_slug = slugify(territory_latin)

                if not territory_slug.isupper():
                    entry['instanca'] = 3
                elif 'okrug' in territory_slug.lower() or territory_slug.lower() == 'grad-beograd':
                    entry['instanca'] = 2
                else:
                    entry['instanca'] = 1

                # Set remaining values depending on whether it is a presidential or parliamentary election
                if is_presidential:
                    entry['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
                    entry['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                    entry['kandidat'] = candidate.title()
                    entry['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)
                else:
                    entry['izbornaLista'] = candidate
                    entry['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)

            rezultat = entry['rezultat']
            if '%' in data_type:
                # Share of the votes, in percent; the file uses a decimal comma.
                rezultat['udeo'] = float(result.text.replace(',', '.'))
            else:
                # Number of votes received.
                rezultat['glasova'] = int(result.text)

            # The document is complete once both values have been read.
            if rezultat['udeo'] is not None and rezultat['glasova'] is not None:
                docs.append(entry)

        # Insert documents. An empty bulk insert is rejected by PyMongo, so a
        # file without any complete result must not reach insert().
        if docs:
            db['izbori'].insert(docs)