def get_offsets(tid):
    """Compute the character offset at which each SECTION_LENGTH-token section
    starts within the cleaned OCR text of a document.

    :param tid: tid of the document whose OCR text to load
    :return: list of character offsets, one per section
    """

    characters_to_delete_list = ['¯']

    document = get_ocr_by_tid(tid, return_bytearray=False)
    document = expand_contractions(document)
    for character in characters_to_delete_list:
        document = document.replace(character, '')

    document_split = re.findall(WORD_SPLIT_REGEX, document)
    text_sections = [
        document_split[i:i + SECTION_LENGTH]
        for i in range(0, len(document_split), SECTION_LENGTH)
    ]
    text_sections = [" ".join(text_section) for text_section in text_sections]

    no_deleted_characters = len(document) - len(" ".join(document_split))

    offsets = []
    for section in text_sections:
        offset = document.find(section)
        offsets.append(offset)
        print("\n")
        print(offset)
        print(document[:2000])
        print(section[:2000])
        print(no_deleted_characters)

    return offsets
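
# Minimal usage sketch for get_offsets ('xxxx0001' is a hypothetical tid):
# each returned entry is the character offset at which one SECTION_LENGTH-token
# section starts inside the cleaned document text.
offsets = get_offsets('xxxx0001')
print(offsets[:5])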
def add_section_offset():
    '''
    Sanity check: walks all documents in id order, accumulates the number of
    sections each document contributes (derived from no_tokens), and compares
    the resulting running section id against SECTION_COUNT.

    :return:
    '''

    db = Database("TOB_FULL")
    con1, cur1 = db.connect()
    cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

    count = 0
    first_section_id_of_doc = 0
    no_t = None
    doc_id = None
    while True:
        row = cur1.fetchone()
        if not row: break
        count += 1

        if count % 100000 == 0:
            print(count, first_section_id_of_doc)

        if count < 100:

            doc_text = get_ocr_by_tid(row['tid'], return_bytearray=False).lower()
            doc_text = expand_contractions(doc_text)
            document_split = re.findall(WORD_SPLIT_REGEX, doc_text)
            text_sections = [document_split[i:i+200] for i in range(0, len(document_split), 200)]

            print(count, first_section_id_of_doc, row['no_tokens'], row['no_tokens']//200+1, len(text_sections))


        first_section_id_of_doc = first_section_id_of_doc + row['no_tokens'] // 200 + 1

        # prevent an off-by-one error when no_tokens is an exact multiple of 200
        if row['no_tokens'] % 200 == 0:
            first_section_id_of_doc -= 1

        no_t = row['no_tokens']
        doc_id = row['id']

    print("final", doc_id, first_section_id_of_doc, no_t)
    print(first_section_id_of_doc - SECTION_COUNT)
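
# Worked example of the section-count arithmetic used in add_section_offset,
# assuming a section length of 200 tokens (matching the hard-coded 200 above);
# _sections_for is an illustrative helper, not part of the module.
def _sections_for(no_tokens, section_length=200):
    count = no_tokens // section_length + 1
    if no_tokens % section_length == 0:
        # exact multiples would otherwise be counted one section too many
        count -= 1
    return count

assert _sections_for(199) == 1
assert _sections_for(200) == 1
assert _sections_for(201) == 2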
def ngram_generator(document, ngram):
    '''
    Yields all ngrams of a given length for the document as a generator.
    Each ngram is a list of tokens, e.g. for document = "This is a test" and
    ngram = 2, the first ngram is ['this', 'is'].

    :param document: raw document text
    :param ngram: ngram length (number of tokens per ngram)
    :return: generator yielding lists of tokens
    '''

    document = document.lower()
    document = expand_contractions(document)
    document_split = re.findall(WORD_SPLIT_REGEX, document)

    start = 0
    while start + ngram <= len(document_split):

        ngram_extract = document_split[start : start + ngram]
        yield ngram_extract
        start += 1
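
# Minimal usage sketch for ngram_generator (assumes the module-level
# WORD_SPLIT_REGEX and expand_contractions used above are available): a bigram
# pass yields overlapping token windows such as ['this', 'is'], ['is', 'a'], ...
for tokens in ngram_generator("This is a short test document.", ngram=2):
    print(tokens)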
def add_no_tokens():
    '''Count the tokens of every document and store the result in docs.no_tokens.'''

    db = Database('TOB_FULL')
    con1, cur1 = db.connect()
    con2, cur2 = db.connect()

    cur1.execute("SELECT tid FROM docs;")

    while True:
        row = cur1.fetchone()
        if not row: break

        tid = row['tid']
        doc_text = get_ocr_by_tid(tid, return_bytearray=False).lower()
        doc_text = expand_contractions(doc_text)
        no_tokens = len(re.findall(WORD_SPLIT_REGEX, doc_text))

        print(tid, no_tokens)
        cur2.execute('UPDATE docs SET no_tokens = {} WHERE tid = "{}";'.format(no_tokens, tid))


    con2.commit()
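
# Design note: the UPDATE in add_no_tokens interpolates tid directly into the
# SQL string. A parameterized sketch, assuming the cursor follows the DB-API
# '%s' paramstyle (e.g. a MySQL driver); _update_no_tokens is an illustrative
# helper, not part of the module.
def _update_no_tokens(cur, tid, no_tokens):
    cur.execute('UPDATE docs SET no_tokens = %s WHERE tid = %s;', (no_tokens, tid))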
def create_utf_text_files():
    '''Extract all .ocr files from the tar archives and write them as cleaned UTF-8 .txt files.'''

    tar_folder = '/pcie/tobacco/'
    for filename in ['f-j.tar.gz', 'k-n.tar.gz', 'p-s.tar.gz', 't-z.tar.gz']:
        tar = tarfile.open(tar_folder + filename)

        count = 0
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if not member.get_info()['name'].endswith('.ocr'):
                continue
            try:
                text = f.read().decode('cp1252', errors='ignore').lower()

                # 8/1/2017 why did I not add the contractions here before creating the folder initially?
                # 8/1/2017 added now
                text = expand_contractions(text)

                text = " ".join(WORD_SPLIT_REGEX.findall(text))
                tid = member.get_info()['name'][-12:-4]

                path = PATH_OCR_FILES + '{}/{}/{}/{}/'.format(tid[0], tid[1], tid[2], tid[3])
                if not os.path.exists(path):
                    os.makedirs(path)

                try:
                    file = codecs.open(path + tid + '.txt', "w", "utf-8")
                    file.write(text)
                    file.close()
                except FileNotFoundError:
                    print(member.get_info())

                count += 1
                if count % 10000 == 0:
                    print(filename, count)
            except AttributeError:
                pass
        tar.close()
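
# Example of the sharded output layout used in create_utf_text_files:
# a tid such as 'abcd1234' (hypothetical) is written to
# PATH_OCR_FILES + 'a/b/c/d/abcd1234.txt'.
tid = 'abcd1234'
print(PATH_OCR_FILES + '{}/{}/{}/{}/{}.txt'.format(tid[0], tid[1], tid[2], tid[3], tid))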
def get_section_to_doc_and_offset_arr():
    """Returns an array that maps each section id to its document id and to the
    start and end character offsets of that section within the document.

    :return: numpy array of shape (SECTION_COUNT, 3) with rows [doc_id, offset_start, offset_end]
    """

    try:
        section_to_doc_and_offset_arr = np.load(
            PATH_TOKENIZED + 'section_to_doc_and_offset_arr.npy')
    except IOError:

        print(
            "section_to_doc_and_offset_arr not found. Creating a new one with section length = {}."
            .format(SECTION_LENGTH))

        db = Database("TOB_FULL")
        con1, cur1 = db.connect()
        cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

        section_to_doc_and_offset_arr = np.zeros((SECTION_COUNT, 3),
                                                 dtype=np.int32)
        doc_id_to_section_id_dict = get_doc_id_to_section_id_dict()

        characters_to_delete_list = ['¯']

        while True:
            row = cur1.fetchone()
            if not row: break

            doc_id = row['id']
            first_section_id_of_doc = doc_id_to_section_id_dict[row['id']][0]

            if doc_id % 10000 == 0:
                print(doc_id)

            # load doc and get offsets
            document = get_ocr_by_tid(row['tid'], return_bytearray=False)
            document = expand_contractions(document)
            doc_len_orig = len(document)
            for character in characters_to_delete_list:
                document = document.replace(character, '')

            document_split = re.findall(WORD_SPLIT_REGEX, document)
            text_sections = [
                document_split[i:i + SECTION_LENGTH]
                for i in range(0, len(document_split), SECTION_LENGTH)
            ]
            text_sections = [
                " ".join(text_section) for text_section in text_sections
            ]

            no_deleted_characters = doc_len_orig - len(
                " ".join(document_split))
            # if no_deleted_characters > 0:
            #     print( doc_id, row['tid'], no_deleted_characters, doc_len_orig)

            for section_idx, section in enumerate(text_sections):
                offset = document.find(section)
                if offset == -1:
                    offset = 0
                    print("\nSection not found in ", row['tid'], doc_id,
                          section_idx)
                    print(document[:200])
                    print(section[:200])
                section_to_doc_and_offset_arr[first_section_id_of_doc +
                                              section_idx][0] = doc_id
                # offset start
                section_to_doc_and_offset_arr[first_section_id_of_doc +
                                              section_idx][1] = offset
                # offset end: characters were deleted, which could move the end of the section
                # by up to no_deleted_characters; the third column stores the corrected end offset
                section_to_doc_and_offset_arr[
                    first_section_id_of_doc + section_idx][2] = offset + len(
                        section) + no_deleted_characters

        np.save(PATH_TOKENIZED + 'section_to_doc_and_offset_arr.npy',
                section_to_doc_and_offset_arr)

    return section_to_doc_and_offset_arr
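
# Minimal lookup sketch for the array built above: each row stores
# [doc_id, offset_start, offset_end]. The section id 12345 and the doc_id_to_tid
# mapping are hypothetical; the sketch assumes the offsets index into the
# contraction-expanded text returned by get_ocr_by_tid.
arr = get_section_to_doc_and_offset_arr()
doc_id, start, end = arr[12345]
document = expand_contractions(get_ocr_by_tid(doc_id_to_tid[doc_id], return_bytearray=False))
print(document[start:end])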
def fill_tables():
    '''
    Fill the docs, doc_types, authors, and recipients tables.
    Assigns each document an id (sorted by timestamp and tid), then collects and inserts all the necessary data.

    :return:
    '''

    db1 = Database("TOBACCO_RAW")
    con1, cur1 = db1.connect()
    db2 = Database("TOBACCO_RAW")
    con2, cur2 = db2.connect()

    valid_tids = get_tid_to_filelength_dict()


    cur1.execute('''SELECT record_key, timestamp, collection_id, id as opt_min_id
                        FROM idl_doc
                        WHERE timestamp IS NOT NULL AND timestamp >= -2177452800 and industry_id=2
                        ORDER BY timestamp, record_key ASC;''')


    # Lists to store data until insert
    doc_types = []
    authors = []
    recipients = []
    docs = []

    idx = 0
    while True:


        # insert and reset after 10000 documents
        if len(docs) >= 10000:
            print("Current id: {}".format(idx))
            batch_insert(docs, doc_types, authors, recipients)
            docs = []
            doc_types = []
            authors = []
            recipients = []

        mand_doc = cur1.fetchone()
        if not mand_doc:
            break


        doc = {'id': idx,
               'tid': mand_doc['record_key'],
               'timestamp': mand_doc['timestamp'],
               # correct for UTC to East Coast time (yes, this is an ugly hack, but otherwise 1/1/1901 is interpreted as 12/31/1900)
               'year': (datetime.datetime.fromtimestamp(mand_doc['timestamp']) + datetime.timedelta(hours=6)).year,
               'collection_id': mand_doc['collection_id'],
               'opt_min_id': mand_doc['opt_min_id'],
               'title': None,
               'pages': None,
        }

        doc_text = get_ocr_by_tid(doc['tid'], return_bytearray=False).lower()
        doc_text = expand_contractions(doc_text)
        doc['no_tokens'] = len(re.findall(WORD_SPLIT_REGEX, doc_text))

        tid = doc['tid']

        # 6/10/17 only add documents that actually contain text.
        if tid not in valid_tids:
            continue

        parsed_doc = parse_opt_min_doc(doc['opt_min_id'], doc['id'], cur2)

        # 1/31 if the document is from a doctype to remove, (e.g. trial list), then remove the document.
        if parsed_doc == 'remove doc':
            print ("Removing doc because trial list", doc)
            continue
        # skip non-tobacco collections
        if not mand_doc['collection_id'] in VALID_COLLECTIONS:
            continue


        doc.update(parsed_doc['doc'])


        try:
            if doc['date_orig'] == '1900':
                print("date == 1900", doc['date_orig'], doc)
                continue
        except KeyError:
            print("no date orig", doc)


        if doc['year'] < 1901:
            print ("Removing doc because < 1901", doc)
            continue
        else:


            docs.append(doc)
            doc_types += parsed_doc['doc_types']

            authors += parsed_doc['authors']
            recipients += parsed_doc['recipients']

            idx += 1

    batch_insert(docs, doc_types, authors, recipients)
def parse_search_tokens(search_tokens, mp_queue=None):
    '''Validate and clean user-supplied search tokens.

    Returns (final_tokens, error), or puts that tuple on mp_queue if one is passed.
    '''

    #search_tokens = ",".join(search_tokens)
    error = ''
    try:
        if len(search_tokens) == 0:
            error = "Please enter one or multiple search terms."
            return {}, error
    except AttributeError:
        print("attr error with search tokens: {}".format(search_tokens))

    # make sure the input only contains valid characters

    if SANITIZE_REGEX.search(" ".join(search_tokens)):
        error = "Search terms can only contain letters, numbers, spaces, commas, and asterisks but not '{}'.\n".format(
            " ".join(set(SANITIZE_REGEX.findall(" ".join(search_tokens)))))
        process_despite_error = False
        if set(SANITIZE_REGEX.findall(" ".join(search_tokens))) == {'-'}:
            error += 'We have replaced the dash with a space.\n'
            search_tokens = [i.replace('-', ' ') for i in search_tokens]
            process_despite_error = True

        for char in ['\'', '\"']:
            if set(SANITIZE_REGEX.findall(" ".join(search_tokens))) == {char}:
                if " ".join(search_tokens).find(char) > -1:
                    print("search tokens before quotation mark", search_tokens)
                    search_tokens = [
                        token.replace(char, '') for token in search_tokens
                    ]
                    print("after", search_tokens)
                    process_despite_error = True
                    error += 'We have removed the quotation marks.\n'

        print("process despite error", process_despite_error)
        if not process_despite_error:
            if mp_queue:
                mp_queue.put(({}, error))
                return
            else:
                return {}, error

    final_tokens = set()
    tokens_not_in_vocabulary = []
    for token in search_tokens:
        if token == '':
            continue
        token = expand_contractions(token)
        if token[0] == '*' or token[-1] == '*':

            if token[0] == token[-1]:
                error = "Wildcard searches can only be done at the beginning or the end of a token but not at both."
                if mp_queue:
                    mp_queue.put(({}, error))
                    return
                else:
                    return {}, error
            else:
                final_tokens = final_tokens.union(
                    wildcard_search(token, ngram=len(token.split())))

        else:
            if check_if_token_in_vocabulary(token):
                final_tokens.add(token)
            else:
                tokens_not_in_vocabulary.append(token)

    if len(tokens_not_in_vocabulary) == 1:
        error += 'The term "{}" does not exist in the vocabulary of tobacco-analytics.\n'.format(
            ", ".join(tokens_not_in_vocabulary))
    elif len(tokens_not_in_vocabulary) > 1:
        error += 'The terms "{}" do not exist in the vocabulary of tobacco-analytics\n'.format(
            ", ".join(tokens_not_in_vocabulary))

    error = error.rstrip('\n')
    print("final cleaned search tokens", search_tokens, error)

    if mp_queue:
        mp_queue.put((final_tokens, error))
    else:
        return final_tokens, error
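
# Minimal usage sketch (the tokens are hypothetical; requires the vocabulary
# and wildcard helpers used above to be available): returns the cleaned token
# set plus any accumulated error message.
tokens, error = parse_search_tokens(['cigarette', 'nicotin*'])
if error:
    print(error)
print(tokens)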