Example #1
def connect_anahashes_to_wordforms(session, anahashes, df, batch_size=50000):
    """
    Create the relation between wordforms and anahashes in the database.

    Given `anahashes`, a dataframe with wordforms and corresponding anahashes,
    create the relations between the two in the wordforms and anahashes tables
    by setting the anahash_id foreign key in the wordforms table.
    """
    LOGGER.info('Connecting anahashes to wordforms.')

    LOGGER.debug('Getting wordform/anahash_id pairs.')
    with get_temp_file() as anahash_to_wf_file:
        total_lines_written = write_json_lines(
            anahash_to_wf_file, get_anahashes(session, anahashes, df))

        update_statement = Wordform.__table__.update(). \
            where(Wordform.wordform_id == bindparam('wf_id')). \
            values(anahash_id=bindparam('a_id'))

        LOGGER.debug('Adding the connections wordform -> anahash_id.')
        sql_query_batches(session, update_statement,
                          read_json_lines(anahash_to_wf_file),
                          total_lines_written, batch_size)

    LOGGER.info('Added the anahash of %s wordforms.', total_lines_written)

    return total_lines_written
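# Illustrative sketch (not the actual ticclat helper): sql_query_batches is
# assumed to execute the bindparam update above for batches of
# {'wf_id': ..., 'a_id': ...} records read back from the temporary
# JSON-lines file.  A minimal version of that batching pattern could be:
from itertools import islice


def sql_query_batches_sketch(session, statement, records, total, batch_size):
    """Execute `statement` with parameter dicts from `records`, one batch at a time."""
    # `total` is presumably used for progress reporting in the real helper;
    # it is not needed for the batching itself.
    records = iter(records)
    while True:
        batch = list(islice(records, batch_size))
        if not batch:
            break
        # Passing a list of parameter dicts makes SQLAlchemy use executemany,
        # which is much faster than updating rows one by one via the ORM.
        session.execute(statement, batch)
    session.flush()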
Example #2
def add_morphological_paradigms(session, in_file):
    """
    Add morphological paradigms to the database from a tab-separated file.
    """
    data = pd.read_csv(in_file,
                       sep='\t',
                       index_col=False,
                       names=[
                           'wordform', 'corpus_freq', 'component_codes',
                           'human_readable_c_code', 'first_year', 'last_year',
                           'dict_ids', 'pos_tags', 'int_ids'
                       ])
    # drop first row (contains empty wordform)
    data = data.drop([0])

    # store the wordforms in the database
    wfs = data[['wordform']].copy()
    bulk_add_wordforms(session, wfs)

    # get the morphological variants from the pandas dataframe
    LOGGER.info('extracting morphological variants')
    morph_paradigms_per_wordform = defaultdict(list)
    with tqdm(total=data.shape[0]) as pbar:
        for row in data.iterrows():
            codes = row[1]['component_codes'].split('#')
            wordform = row[1]['wordform']
            for code in codes:
                morph_paradigms_per_wordform[wordform].append(
                    split_component_code(code, wordform))
            pbar.update()

    LOGGER.info('Looking up wordform ids.')
    select_statement = select([Wordform
                               ]).where(Wordform.wordform.in_(wfs['wordform']))
    mapping = session.execute(select_statement).fetchall()

    LOGGER.info('Writing morphological variants to file.')
    with get_temp_file() as mp_file:
        total_lines_written = write_json_lines(
            mp_file, morph_iterator(morph_paradigms_per_wordform, mapping))
        LOGGER.info('Wrote %s morphological variants.', total_lines_written)
        LOGGER.info('Inserting morphological variants to the database.')
        sql_insert_batches(session,
                           MorphologicalParadigm,
                           read_json_lines(mp_file),
                           batch_size=50000)
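# Hedged usage sketch (the path is illustrative): the input must be a
# tab-separated file with the columns listed in pd.read_csv above, and
# `session` an open SQLAlchemy session (e.g. from dbutils.get_session).
def load_paradigms_example(session):
    add_morphological_paradigms(session, 'morphological_paradigms.tsv')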
Example #3
def test_read_and_write_json_lines(fs):
    objects = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]

    fname = 'objects'

    f = open(fname, 'w')

    total = write_json_lines(f, objects)
    f.close()
    f = open(fname, 'r')

    assert os.path.exists(fname)
    assert total == len(objects)

    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results
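# Illustrative sketch (the real ticclat helpers may differ): the behaviour
# exercised by this test, and relied on by the temp-file handling in the
# other examples, is "one JSON object per line"; write returns the number of
# objects written and read yields them back as dicts.
import json


def write_json_lines_sketch(fh, objects):
    """Write each object as a JSON document on its own line; return the count."""
    total = 0
    for obj in objects:
        fh.write(json.dumps(obj))
        fh.write('\n')
        total += 1
    return total


def read_json_lines_sketch(fh):
    """Yield one parsed JSON object per non-empty line."""
    for line in fh:
        line = line.strip()
        if line:
            yield json.loads(line)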
Example #4
def bulk_add_anahashes(session,
                       anahashes,
                       tqdm_factory=None,
                       batch_size=10000):
    """anahashes is pandas dataframe with the column wordform (index), anahash
    """
    LOGGER.info('Adding anahashes.')
    # Remove duplicate anahashes
    unique_hashes = anahashes.copy().drop_duplicates(subset='anahash')
    LOGGER.debug('The input data contains %s wordform/anahash pairs.',
                 anahashes.shape[0])
    LOGGER.debug('There are %s unique anahash values.', unique_hashes.shape[0])

    count_added = 0

    with get_temp_file() as anahashes_to_add_file:
        if tqdm_factory is not None:
            pbar = tqdm_factory(total=unique_hashes.shape[0])
        for chunk in chunk_df(unique_hashes, batch_size=batch_size):
            # Find out which anahashes are not yet in the database.
            ahs = set(list(chunk['anahash']))

            select_statement = select([Anahash
                                       ]).where(Anahash.anahash.in_(ahs))
            result = session.execute(select_statement).fetchall()

            existing_ahs = {row[1] for row in result}

            for non_existing_ah in ahs.difference(existing_ahs):
                anahashes_to_add_file.write(
                    json.dumps({'anahash': non_existing_ah}))
                anahashes_to_add_file.write('\n')
                count_added += 1
            if tqdm_factory is not None:
                pbar.update(chunk.shape[0])
        if tqdm_factory is not None:
            pbar.close()

        bulk_add_anahashes_core(session,
                                read_json_lines(anahashes_to_add_file))

    LOGGER.info('Added %s anahashes.', count_added)

    return count_added
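# Illustrative sketch (not the actual ticclat helper): chunk_df is assumed
# to yield consecutive row slices of a dataframe, at most batch_size rows
# each, so the anahash lookups above stay bounded in size.
def chunk_df_sketch(df, batch_size=10000):
    """Yield consecutive slices of `df` with at most batch_size rows each."""
    for start in range(0, df.shape[0], batch_size):
        yield df.iloc[start:start + batch_size]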
Example #5
def test_read_and_write_json_lines_empty(fs):
    objects = []

    fname = 'objects'

    f = open(fname, 'w')

    write_json_lines(f, objects)

    f.close()
    f = open(fname, 'r')

    assert os.path.exists(fname)
    assert os.path.getsize(fname) == 0

    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results
Example #6
def add_lexicon_with_links(session,
                           lexicon_name,
                           vocabulary,
                           wfs,
                           from_column,
                           to_column,
                           from_correct,
                           to_correct,
                           batch_size=50000,
                           preprocess_wfs=True,
                           to_add=None):
    """
    Add wordforms from a lexicon with links to the database.

    Lexica with links contain pairs of wordforms that are linked. The `wfs`
    dataframe must contain two columns, `from_column` and `to_column`, which
    hold the two words of each pair (one pair per row). With the boolean
    arguments `from_correct` and `to_correct` you indicate whether the
    corresponding columns contain correct wordforms. Typically, there are two
    types of linked lexica: True + True, which links correct wordforms to
    each other (e.g. morphological variants), and True + False, which links
    correct wordforms to incorrect ones (e.g. a spelling correction list).
    """
    LOGGER.info('Adding lexicon with links between wordforms.')

    if to_add is None:
        to_add = []

    # Make a dataframe containing all wordforms in the lexicon
    wordforms = pd.DataFrame()
    wordforms['wordform'] = wfs[from_column].append(wfs[to_column],
                                                    ignore_index=True)
    wordforms = wordforms.drop_duplicates(subset='wordform')

    # Create the lexicon (with all the wordforms)
    lexicon = add_lexicon(session,
                          lexicon_name,
                          vocabulary,
                          wordforms,
                          preprocess_wfs=preprocess_wfs)

    wf_mapping = get_wf_mapping(session, lexicon_id=lexicon.lexicon_id)

    if preprocess_wfs:
        wfs = preprocess_wordforms(wfs, columns=[from_column, to_column])

    with get_temp_file() as wfl_file:
        LOGGER.debug(
            'Writing wordform links to add to (possibly unnamed) temporary file.'
        )

        with get_temp_file() as wfls_file:
            LOGGER.debug(
                'Writing wordform link sources to add to (possibly unnamed) temporary file.'
            )

            num_l, num_s = write_wf_links_data(session,
                                               wf_mapping,
                                               wfs,
                                               from_column,
                                               to_column,
                                               lexicon.lexicon_id,
                                               from_correct,
                                               to_correct,
                                               wfl_file,
                                               wfls_file,
                                               add_columns=to_add)

            LOGGER.info('Inserting %s wordform links.', num_l)
            sql_insert_batches(session,
                               WordformLink,
                               read_json_lines(wfl_file),
                               batch_size=batch_size)

            LOGGER.info('Inserting %s wordform link sources.', num_s)
            sql_insert_batches(session,
                               WordformLinkSource,
                               read_json_lines(wfls_file),
                               batch_size=batch_size)

    return lexicon
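# Hedged usage sketch (column names, path and lexicon name are illustrative):
# adding a spelling-correction list as a linked lexicon, i.e. the True +
# False case from the docstring; vocabulary=False is an assumption, since a
# correction list also contains incorrect wordforms.
import pandas as pd


def add_correction_list_example(session):
    corrections = pd.read_csv('corrections.tsv', sep='\t',
                              names=['correct', 'variant'])
    return add_lexicon_with_links(session,
                                  lexicon_name='example correction list',
                                  vocabulary=False,
                                  wfs=corrections,
                                  from_column='correct',
                                  to_column='variant',
                                  from_correct=True,
                                  to_correct=False)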
Example #7
def add_corpus_core(session,
                    corpus_matrix,
                    vectorizer,
                    corpus_name,
                    document_metadata=pd.DataFrame(),
                    batch_size=50000):
    """
    Add a corpus to the database.

    A corpus is a collection of documents, which is a collection of words.
    This function adds all words as wordforms to the database, records their
    "attestation" (the fact that they occur in a certain document and with what
    frequency), adds the documents they belong to, adds the corpus and adds the
    corpus ID to the documents.

    Inputs:
        session: SQLAlchemy session (e.g. from `dbutils.get_session`)
        corpus_matrix: the dense corpus term-document matrix, like from
                       `tokenize.terms_documents_matrix_ticcl_frequency`
        vectorizer: the terms in the term-document matrix, as given by
                    `tokenize.terms_documents_matrix_ticcl_frequency`
        corpus_name: the name of the corpus in the database
        document_metadata: see `ticclat_schema.Document` for all the possible
                           metadata. Make sure the index of this dataframe
                           matches with the document identifiers in the term-
                           document matrix, which can be easily achieved by
                           resetting the index for a Pandas dataframe.
        batch_size: batch handling of wordforms to avoid memory issues.
    """
    with get_temp_file() as wf_file:
        write_json_lines(wf_file, iterate_wf(vectorizer.vocabulary_))

        # Prepare the documents to be added to the database
        LOGGER.info('Creating document data')
        corpus_csr = scipy.sparse.csr_matrix(corpus_matrix)
        word_counts = corpus_csr.sum(axis=1)  # sum the rows

        wc_list = np.array(word_counts).flatten().tolist()

        document_metadata['word_count'] = wc_list

        # Determine which wordforms in the vocabulary need to be added to the
        # database
        LOGGER.info('Determine which wordforms need to be added')
        with get_temp_file() as wf_to_add_file:
            with tqdm(total=count_lines(wf_file)) as pbar:
                for chunk in chunk_json_lines(wf_file, batch_size=batch_size):
                    # Find out which wordforms are not yet in the database
                    wordforms = {wf['wordform'] for wf in chunk}
                    select_statement = select([Wordform]).where(
                        Wordform.wordform.in_(wordforms))
                    result = session.execute(select_statement).fetchall()

                    # wf: (id, wordform, anahash_id, wordform_lowercase)
                    existing_wfs = {wf[1] for wf in result}
                    for wordform in wordforms.difference(existing_wfs):
                        wf_to_add_file.write(
                            json.dumps({
                                'wordform': wordform,
                                'wordform_lowercase': wordform.lower()
                            }))
                        wf_to_add_file.write('\n')
                    pbar.update(batch_size)

            # Create the corpus (in a session) and get the ID
            LOGGER.info('Creating the corpus')
            corpus = Corpus(name=corpus_name)
            session.add(corpus)

            # add the documents using ORM, because we need to link them to the
            # corpus
            LOGGER.info('Adding the documents')
            for doc in document_metadata.to_dict(orient='records'):
                document_obj = Document(**doc)
                document_obj.document_corpora.append(corpus)
            session.flush()
            corpus_id = corpus.corpus_id

            # Insert the wordforms that need to be added using SQLAlchemy core (much
            # faster than using the ORM)
            LOGGER.info('Adding the wordforms')
            bulk_add_wordforms_core(session, read_json_lines(wf_to_add_file))

    LOGGER.info('Prepare adding the text attestations')
    # make a mapping from wordform to wordform_id in the database
    df = pd.DataFrame.from_dict(vectorizer.vocabulary_, orient='index')
    df = df.reset_index()

    LOGGER.info('\tGetting the wordform ids')
    wf_mapping = {}

    for chunk in chunk_df(df, batch_size=batch_size):
        to_select = list(chunk['index'])
        select_statement = select([Wordform
                                   ]).where(Wordform.wordform.in_(to_select))
        result = session.execute(select_statement).fetchall()
        for wordform in result:
            # wordform: (id, wordform, anahash_id, wordform_lowercase)
            wf_mapping[wordform[1]] = wordform[0]

    LOGGER.info('\tGetting the document ids')
    # get doc_ids
    select_statement = select([corpusId_x_documentId.join(Corpus).join(Document)]) \
        .where(Corpus.corpus_id == corpus_id).order_by(Document.document_id)
    result = session.execute(select_statement).fetchall()
    # row: (corpus_id, document_id, ...)
    doc_ids = [row[1] for row in result]

    LOGGER.info('\tReversing the mapping')
    # reverse the vocabulary mapping: term-document matrix index -> wordform
    word_from_tdmatrix_id = dict(
        zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys()))

    LOGGER.info('\tGetting the text attestations')
    with get_temp_file() as ta_file:
        write_json_lines(
            ta_file,
            get_tas(corpus_matrix, doc_ids, wf_mapping, word_from_tdmatrix_id))

        LOGGER.info('Adding the text attestations')
        total = count_lines(ta_file)
        bulk_add_textattestations_core(session,
                                       read_json_lines(ta_file),
                                       total=total,
                                       batch_size=batch_size)
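# Hedged usage sketch: corpus_matrix and vectorizer are assumed to come from
# tokenize.terms_documents_matrix_ticcl_frequency (see the docstring above);
# the metadata columns and corpus name are illustrative only.
import pandas as pd


def add_example_corpus(session, corpus_matrix, vectorizer):
    metadata = pd.DataFrame({'title': ['document 1', 'document 2']})
    # The dataframe index must line up with the document axis of the
    # term-document matrix; resetting it achieves that here.
    metadata = metadata.reset_index(drop=True)
    add_corpus_core(session, corpus_matrix, vectorizer,
                    corpus_name='example corpus',
                    document_metadata=metadata)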
Example #8
def test_write_wf_links_data(dbsession, fs):
    wfl_file = 'wflinks'
    wfls_file = 'wflsources'

    name = 'linked test lexicon'

    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3', 'wf1s', 'wf2s', 'wf3s']

    lex = add_lexicon(dbsession, lexicon_name=name, vocabulary=True, wfs=wfs)

    wfs = pd.DataFrame()
    wfs['lemma'] = ['wf1', 'wf2', 'wf3']
    wfs['variant'] = ['wf1s', 'wf2s', 'wf3s']

    wfm = get_wf_mapping(dbsession, lexicon=lex)

    links_file = open(wfl_file, 'w')
    sources_file = open(wfls_file, 'w')

    num_l, num_s = write_wf_links_data(
        dbsession,
        wf_mapping=wfm,
        links_df=wfs,
        wf_from_name='lemma',
        wf_to_name='variant',
        lexicon_id=lex.lexicon_id,
        wf_from_correct=True,
        wf_to_correct=True,
        links_file=links_file,
        sources_file=sources_file,
    )

    links_file.close()
    sources_file.close()

    links_file = open(wfl_file, 'r')
    sources_file = open(wfls_file, 'r')

    assert num_l == 3 * 2
    assert num_s == 3 * 2

    wflinks = []
    for wf1, wf2 in zip(wfs['lemma'], wfs['variant']):
        wflinks.append({"wordform_from": wfm[wf1], "wordform_to": wfm[wf2]})
        wflinks.append({"wordform_from": wfm[wf2], "wordform_to": wfm[wf1]})

    wflsources = []
    for wfl in wflinks:
        wflsources.append({
            "wordform_from": wfl['wordform_from'],
            "wordform_to": wfl['wordform_to'],
            "lexicon_id": lex.lexicon_id,
            "wordform_from_correct": True,
            "wordform_to_correct": True
        })

    for wfls1, wfls2 in zip(read_json_lines(sources_file), wflsources):
        assert wfls1 == wfls2

    links_file.close()
    sources_file.close()