Example #1
 def tokenize_for_phrases(self, phrase):
     """Return list of phrases found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
     """
     phrase = wash_for_utf8(phrase)
     return [phrase]
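
Every snippet on this page ultimately calls wash_for_utf8 from Invenio's textutils module. As a rough orientation only, here is a self-contained stand-in that reproduces the behaviour exercised by the tests further down (invalid bytes dropped, already-clean input passed through); the correct flag mirrors the second argument seen in those tests, but treat the whole sketch as an assumption rather than Invenio's actual code:

def wash_for_utf8_sketch(text, correct=True):
    """Hypothetical stand-in, NOT the Invenio implementation: return TEXT
    as valid UTF-8, silently dropping undecodable bytes when CORRECT is
    true and raising otherwise."""
    errors = "ignore" if correct else "strict"
    if isinstance(text, bytes):
        return text.decode("utf-8", errors)
    # Already-decoded text: round-trip through UTF-8 to weed out anything
    # that cannot be encoded (e.g. lone surrogates).
    return text.encode("utf-8", errors).decode("utf-8", errors)

print(wash_for_utf8_sketch(b"Zdzb\x82lo"))   # invalid byte 0x82 is dropped
print(wash_for_utf8_sketch("Göppert"))       # clean input is returned unchanged
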
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains control characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced together
    """

    # stuff we don't like
    label_head = "\\label{"

    # reassemble that sucker
    if end_line > begin_line:
        # our caption spanned multiple lines
        caption = lines[begin_line][begin_index:]

        for included_line_index in range(begin_line + 1, end_line):
            caption = caption + " " + lines[included_line_index]

        caption = caption + " " + lines[end_line][:end_index]
        caption = caption.replace("\n", " ")
        caption = caption.replace("  ", " ")
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # we know that our caption is only one line, so if there's a label
        # tag in it, it will be all on one line.  so we make up some args
        dummy_start, dummy_start_line, label_end, dummy_end = find_open_and_close_braces(0, label_begin, "{", [caption])
        caption = caption[:label_begin] + caption[label_end + 1 :]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode("utf-8", "xmlcharrefreplace"), wash=True)
    except:  # that damn encode thing threw an error on astro-ph/0601014
        sys.stderr.write(caption)
        sys.stderr.write(" cannot be processed\n")
        caption = caption.replace("&", "&amp;").replace("<", "&lt;")
        caption = caption.replace(">", "&gt;")

    caption = caption.strip()

    if len(caption) > 1 and caption[0] == "{" and caption[-1] == "}":
        caption = caption[1:-1]

    return caption
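
To make the reassembly branch concrete, here is a tiny worked example of the multi-line case; the input lines, offsets and expected output are made up, and only the string manipulation from the function above is reproduced (no label or XML cleanup):

# Illustrative only: the multi-line branch boils down to joining the
# relevant slice of each line and collapsing whitespace.
lines = [
    r"\begin{figure} \caption{Mass spectrum of the",
    "selected candidates, with the fit",
    r"overlaid.} \label{fig:mass} \end{figure}",
]
begin_line, begin_index = 0, 24   # offset just after "\caption{" in the first line
end_line, end_index = 2, 9        # offset of the closing "}" in the last line
caption = lines[begin_line][begin_index:]
for i in range(begin_line + 1, end_line):
    caption += " " + lines[i]
caption += " " + lines[end_line][:end_index]
print(" ".join(caption.split()))
# -> Mass spectrum of the selected candidates, with the fit overlaid.
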
Example #3
 def tokenize(self, phrase):
     """Return list of phrases found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
     """
     phrase = wash_for_utf8(phrase)
     return [phrase]
     ## NOTE: everything below this return is unreachable; it is left over
     ## from an older word-splitting implementation.
     ## Note that we don't break phrases, they are used for exact style
     ## of searching.
     words = {}
     phrase = strip_accents(phrase)
     # 1st split phrase into blocks according to whitespace
     for block1 in phrase_delimiter_re.split(strip_accents(phrase)):
         block1 = block1.strip()
         if block1 and self.stemming_language:
             new_words = []
             for block2 in re_punctuation.split(block1):
                 block2 = block2.strip()
                 if block2:
                     for block3 in block2.split():
                         block3 = block3.strip()
                         if block3:
                             # Note that we don't stem phrases, they
                             # are used for exact style of searching.
                             new_words.append(block3)
             block1 = ' '.join(new_words)
         if block1:
             words[block1] = 1
     return words.keys()
Example #4
 def tokenize_for_phrases(self, phrase):
     """Return list of phrases found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
     """
     phrase = wash_for_utf8(phrase)
     return [phrase]
 def test_chinese_string_washing(self):
     """textutils - testing washing functions on chinese script"""
     some_str = """春眠暁を覚えず
     処処に啼鳥と聞く
     夜来風雨の声
     花落つること
     知んぬ多少ぞ"""
     self.assertEqual(some_str, wash_for_utf8(some_str))
Example #7
 def tokenize_for_phrases(self, phrase):
     """
         Another name for tokenize_for_fuzzy_authors.
         It's for the compatibility.
         See: tokenize_for_fuzzy_authors
     """
     phrase = wash_for_utf8(phrase)
     phrase = lower_index_term(phrase)
     return self.tokenize_for_fuzzy_authors(strip_accents(phrase))
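
tokenize_for_fuzzy_authors itself is not shown on this page, but the normalisation applied before delegating to it is: wash, lowercase, strip accents. A self-contained sketch of that pipeline, with a hypothetical strip_accents stand-in (the real Invenio helper may behave differently):

import unicodedata

def strip_accents_sketch(text):
    # Hypothetical stand-in: decompose to NFKD and drop combining marks.
    return "".join(ch for ch in unicodedata.normalize("NFKD", text)
                   if not unicodedata.combining(ch))

phrase = "Müller, Jean-Luc"
phrase = phrase.lower()                  # stand-in for lower_index_term
print(strip_accents_sketch(phrase))      # -> muller, jean-luc
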
Example #8
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block,
                                               self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(
                        subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock,
                                                      self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(
                            alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(
                            stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(
                            stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
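
The four-stage split above (whitespace, edge punctuation, inner punctuation, alphanumeric separators) is easiest to see on a concrete phrase. The sketch below uses made-up patterns in place of re_punctuation and re_separators, and leaves out stopword removal, length checks and stemming; it illustrates the cascade, it is not the Invenio tokenizer:

import re

re_punct_sketch = re.compile(r"[.,;:!?]")   # stand-in for re_punctuation
re_seps_sketch = re.compile(r"[-_/]")       # stand-in for re_separators

def words_sketch(phrase):
    words = set()
    for block in phrase.lower().split():            # 1st: whitespace
        block = block.strip(".,;:!?")                # 2nd: edge punctuation
        if not block:
            continue
        words.add(block)
        if block.startswith("arxiv:"):               # arXiv special case
            words.add(block.split(":", 1)[1])
        for subblock in re_punct_sketch.split(block):   # 3rd: punctuation
            if subblock:
                words.add(subblock)
                for group in re_seps_sketch.split(subblock):  # 4th: separators
                    if group:
                        words.add(group)
    return sorted(words)

print(words_sketch("Non-linear dynamics, arXiv:1007.5048"))
# -> ['1007', '1007.5048', '5048', 'arxiv', 'arxiv:1007.5048',
#     'dynamics', 'linear', 'non', 'non-linear']
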
Example #9
 def tokenize_for_words(self, phrase):
     """
         If CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES is 1 we tokenize only for family names.
         In other case we perform standard tokenization for words.
     """
     phrase = wash_for_utf8(phrase)
     phrase = lower_index_term(phrase)
     phrase = strip_accents(phrase)
     if CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES:
         return self.get_author_family_name_words_from_phrase(phrase)
     else:
         return self.tokenize_for_words_default(phrase)
 def tokenize_for_words(self, phrase, recid):
     """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
     """
     if not self.isAuthority(recid):
         return []
     words = {}
     formulas = []
     if self.remove_html_markup and phrase.find("</") > -1:
         phrase = remove_html_markup(phrase)
     if self.remove_latex_markup:
         formulas = latex_formula_re.findall(phrase)
         phrase = remove_latex_markup(phrase)
         phrase = latex_formula_re.sub(" ", phrase)
     phrase = wash_for_utf8(phrase)
     phrase = lower_index_term(phrase)
     # 1st split phrase into blocks according to whitespace
     for block in strip_accents(phrase).split():
         # 2nd remove leading/trailing punctuation and add block:
         block = re_block_punctuation_begin.sub("", block)
         block = re_block_punctuation_end.sub("", block)
         if block:
             stemmed_block = remove_stopwords(block, self.remove_stopwords)
             stemmed_block = length_check(stemmed_block)
             stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
             if stemmed_block:
                 words[stemmed_block] = 1
             if re_arxiv.match(block):
                 # special case for blocks like `arXiv:1007.5048' where
                 # we would like to index the part after the colon
                 # regardless of dot or other punctuation characters:
                 words[block.split(":", 1)[1]] = 1
             # 3rd break each block into subblocks according to punctuation and add subblocks:
             for subblock in re_punctuation.split(block):
                 stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                 stemmed_subblock = length_check(stemmed_subblock)
                 stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                 if stemmed_subblock:
                     words[stemmed_subblock] = 1
                 # 4th break each subblock into alphanumeric groups and add groups:
                 for alphanumeric_group in re_separators.split(subblock):
                     stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                     stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                     stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                     if stemmed_alphanumeric_group:
                         words[stemmed_alphanumeric_group] = 1
     for block in formulas:
         words[block] = 1
     return words.keys()
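
get_author_family_name_words_from_phrase is not shown here. Assuming author fields follow the usual "Surname, First Names" convention, the idea behind the CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES switch could be sketched as follows (purely illustrative, not the Invenio implementation):

def family_name_words_sketch(phrase):
    # Hypothetical: keep only the part before the first comma, so first
    # names never reach the word index.
    family = phrase.split(",", 1)[0]
    return family.lower().split()

print(family_name_words_sketch("Göppert-Mayer, Maria"))   # -> ['göppert-mayer']
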
Example #11
def create_contextfiles(extracted_image_data):
    """
    Saves the context for each image to a file in the current sub-directory,
    one file per image, named after the image with a '.context' suffix.

    @param extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts from
        this document.
    """
    for image, dummy2, dummy3, contexts in extracted_image_data:
        if len(contexts) > 0 and image != "":
            context_filepath = image + '.context'
            fd = open(context_filepath, 'w')
            for context_line in contexts:
                fd.write(wash_for_utf8(context_line) + '\n\n')
            fd.close()
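
A minimal usage sketch for create_contextfiles, run in a scratch directory; the image name and context strings are made up, and wash_for_utf8 is stubbed out so the snippet stands alone (in Invenio it would come from textutils):

import os
import tempfile

wash_for_utf8 = lambda s: s   # stub only, NOT the Invenio helper

extracted_image_data = [
    ("figure1", "", [], ["First sentence that mentions figure 1.",
                         "Another mention, a paragraph later."]),
    ("", "", [], ["Dropped: no image name to attach this context to."]),
]

os.chdir(tempfile.mkdtemp())
create_contextfiles(extracted_image_data)       # the function defined above
print(open("figure1.context").read())
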
Example #13
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock,
                                                self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(
                                subblock):
                            alphanumeric_group = remove_stopwords(
                                alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(
                                alphanumeric_group)
                            alphanumeric_group = apply_stemming(
                                alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' %
                                          (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
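
Stripped of washing, stopwords and stemming, the pair index boils down to recording each pair of consecutive surviving words. A compact sliding-window sketch of that idea (illustrative only; the real tokenizer also carries last_word across blocks and uses the config-driven splits):

def pairs_sketch(phrase):
    words = phrase.lower().split()
    return ["%s %s" % (a, b) for a, b in zip(words, words[1:])]

print(pairs_sketch("Higgs boson decay channels"))
# -> ['higgs boson', 'boson decay', 'decay channels']
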
Example #14
 def test_russian_characters_washing(self):
     """textutils - washing Russian characters for UTF-8"""
     self.assertEqual(wash_for_utf8('''
     В тени дерев, над чистыми водами
     Дерновый холм вы видите ль, друзья?
     Чуть слышно там плескает в брег струя;
     Чуть ветерок там дышит меж листами;
     На ветвях лира и венец...
     Увы! друзья, сей холм - могила;
     Здесь прах певца земля сокрыла;
     Бедный певец!'''), '''
     В тени дерев, над чистыми водами
     Дерновый холм вы видите ль, друзья?
     Чуть слышно там плескает в брег струя;
     Чуть ветерок там дышит меж листами;
     На ветвях лира и венец...
     Увы! друзья, сей холм - могила;
     Здесь прах певца земля сокрыла;
     Бедный певец!''')
 def tokenize_for_pairs(self, phrase, recid):
     """Return list of words found in PHRASE.  Note that the phrase is
        split into groups depending on the alphanumeric characters and
        punctuation characters definition present in the config file.
     """
     if not self.isAuthority(recid):
         return []
     words = {}
     if self.remove_html_markup and phrase.find("</") > -1:
         phrase = remove_html_markup(phrase)
     if self.remove_latex_markup:
         phrase = remove_latex_markup(phrase)
         phrase = latex_formula_re.sub(" ", phrase)
     phrase = wash_for_utf8(phrase)
     phrase = lower_index_term(phrase)
     # 1st split phrase into blocks according to whitespace
     last_word = ""
     for block in strip_accents(phrase).split():
         # 2nd remove leading/trailing punctuation and add block:
         block = re_block_punctuation_begin.sub("", block)
         block = re_block_punctuation_end.sub("", block)
         if block:
             block = remove_stopwords(block, self.remove_stopwords)
             block = length_check(block)
             block = apply_stemming(block, self.stemming_language)
             # 3rd break each block into subblocks according to punctuation and add subblocks:
             for subblock in re_punctuation.split(block):
                 subblock = remove_stopwords(subblock, self.remove_stopwords)
                 subblock = length_check(subblock)
                 subblock = apply_stemming(subblock, self.stemming_language)
                 if subblock:
                     # 4th break each subblock into alphanumeric groups and add groups:
                     for alphanumeric_group in re_separators.split(subblock):
                         alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                         alphanumeric_group = length_check(alphanumeric_group)
                         alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                         if alphanumeric_group:
                             if last_word:
                                 words["%s %s" % (last_word, alphanumeric_group)] = 1
                             last_word = alphanumeric_group
     return words.keys()
Example #16
 def test_empty_string_wash(self):
     """textutils - washing an empty string"""
     self.assertEqual(wash_for_utf8(""), "")
Example #17
 def test_remove_incorrect_unicode_characters(self):
     """textutils - washing out the incorrect characters"""
     self.assertEqual(wash_for_utf8("Ź\206dź\204bło żół\203wia \202"),
                      "Źdźbło żółwia ")
Example #18
 def test_normal_legal_string_washing(self):
     """textutils - testing UTF-8 washing on a perfectly normal string"""
     some_str = "This is an example string"
     self.assertEqual(some_str, wash_for_utf8(some_str))
Example #19
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print(usage)
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Read and wash incoming data
    file_data = open_marc_file(input_filename)
    washed_data = wash_for_xml(wash_for_utf8(file_data))

    # Transform MARCXML to record structure
    records = create_records(washed_data)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        if record is None:
            sys.stderr.write("Record is None: %s" % (rec[2],))
            sys.exit(1)
        # Perform various checks to determine a suitable action to be taken for
 def test_only_incorrect_unicode_wash(self):
     """textutils - washing an empty string"""
     self.assertEqual(wash_for_utf8("\202\203\204\205"), "")
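
The order of the two washes in the snippet above matters: the raw file contents are first forced into valid UTF-8 with wash_for_utf8 and only then scrubbed of characters XML 1.0 forbids. A rough stand-in for that second step (the real wash_for_xml lives in Invenio's textutils and may differ):

import re

# Hypothetical approximation: drop control characters that are illegal in
# XML 1.0 (everything below U+0020 except tab, newline, carriage return).
_illegal_xml_re = re.compile("[\x00-\x08\x0b\x0c\x0e-\x1f]")

def wash_for_xml_sketch(text):
    return _illegal_xml_re.sub("", text)

print(wash_for_xml_sketch("title\x02 with a stray control character"))
# -> title with a stray control character
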
Example #21
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains control characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced together
    """

    # stuff we don't like
    label_head = '\\label{'

    # reassemble that sucker
    if end_line > begin_line:
        # our caption spanned multiple lines
        caption = lines[begin_line][begin_index:]

        for included_line_index in range(begin_line + 1, end_line):
            caption = caption + ' ' + lines[included_line_index]

        caption = caption + ' ' + lines[end_line][:end_index]
        caption = caption.replace('\n', ' ')
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # we know that our caption is only one line, so if there's a label
        # tag in it, it will be all on one line.  so we make up some args
        dummy_start, dummy_start_line, label_end, dummy_end = \
                find_open_and_close_braces(0, label_begin, '{', [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'),
                                 wash=True)
    except:  # that damn encode thing threw an error on astro-ph/0601014
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
 def test_remove_incorrect_unicode_characters(self):
     """textutils - washing out the incorrect characters"""
     self.assertEqual(wash_for_utf8("Ź\206dź\204bło żół\203wia \202"), "Źdźbło żółwia ")
 def test_normal_legal_string_washing(self):
     """textutils - testing UTF-8 washing on a perfectly normal string"""
     some_str = "This is an example string"
     self.assertEqual(some_str, wash_for_utf8(some_str))
Example #24
 def test_already_utf8_input(self):
     """textutils - washing a Unicode string into UTF-8 binary string"""
     self.assertEqual('Göppert', wash_for_utf8(u'G\xf6ppert', True))
Example #25
 def test_only_incorrect_unicode_wash(self):
     """textutils - washing an empty string"""
     self.assertEqual(wash_for_utf8("\202\203\204\205"), "")
 def test_empty_string_wash(self):
     """textutils - washing an empty string"""
     self.assertEqual(wash_for_utf8(""), "")