Пример #1
0
 def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound,
                                 phrase: LineOrPhrase) -> TextAnnotation:
     """
     Build a CopyrightAnnotation from a pattern match found in a phrase.

     :param locale: locale value stored on the resulting annotation
     :param ptrn: the match; read for ``name``, ``start``, ``end`` and also
         for ``company``, ``start_year`` and ``end_year``
         (NOTE(review): presumably always a CopyrightPatternFound - confirm)
     :param phrase: object whose ``text`` is sliced with the match span
     :return: the populated annotation
     """
     span_start, span_end = ptrn.start, ptrn.end
     annotation = CopyrightAnnotation(
         name=ptrn.name,
         coords=(span_start, span_end),
         text=phrase.text[span_start:span_end],
         locale=locale)

     # Copyright-specific fields are copied over after construction.
     annotation.company = ptrn.company
     annotation.year_start = ptrn.start_year
     annotation.year_end = ptrn.end_year
     return annotation
    def split_copyright_date(cls, ant: CopyrightAnnotation) -> None:
        """
        Fill ``ant.year_start`` / ``ant.year_end`` from ``ant.date``.

        Every match of ``cls.copyright_dates_re`` in the date string is
        converted to ``int``. A single value sets ``year_start``; exactly two
        values set both fields when the first is in range and the second is
        not smaller than the first. In all other cases the annotation is
        left untouched.

        :param ant: annotation to update in place
        """
        if not ant.date:
            return

        def in_range(year: int) -> bool:
            # Accept only values strictly between 100 and 10000.
            return 100 < year < 10000

        found = [int(m.group())
                 for m in cls.copyright_dates_re.finditer(ant.date)]

        if len(found) == 1:
            if in_range(found[0]):
                ant.year_start = found[0]
        elif len(found) == 2:
            first, last = found
            if in_range(first) and last >= first:
                ant.year_start = first
                ant.year_end = last
 def derive_company_name(cls,
                         ant: CopyrightAnnotation,
                         phrase: str) -> None:
     """
     Make sure ``ant.company`` holds a usable company name.

     An existing value is trimmed and kept if ``cls.reg_valid_company_name``
     finds a match in it; otherwise it is reset to an empty string and
     candidates are collected with ``cls.reg_company_name``, first from
     ``ant.name`` and, if that yields nothing, from ``phrase``. The candidate
     chosen by ``cls.take_best_company_name`` is stored, trimmed.

     :param ant: annotation to update in place
     :param phrase: fallback text searched when ``ant.name`` has no candidates
     """
     trim_chars = ' ,;-(:'

     if ant.company:
         ant.company = ant.company.strip(trim_chars)
         if cls.reg_valid_company_name.search(ant.company):
             return
         ant.company = ''

     # The phrase is only scanned when the name produced no candidates.
     candidates = (
         [m.group(0) for m in cls.reg_company_name.finditer(ant.name)]
         or [m.group(0) for m in cls.reg_company_name.finditer(phrase)])
     if not candidates:
         return

     ant.company = cls.take_best_company_name(candidates)
     ant.company = ant.company.strip(trim_chars)
Пример #4
0
    def get_copyright_annotations(cls, text: str, return_sources=False) \
            -> Generator[CopyrightAnnotation, None, None]:
        """
        Find copyright notices in text and yield one annotation per match.

        The text is split into sentences, each sentence into phrases, and
        ``cls.copyright_ptn_re`` is applied to every phrase. For each match
        the date is split into years and a company name is derived before
        the annotation is yielded.

        :param text: text to scan
        :param return_sources: when true, the stripped matched source text is
            stored in the annotation's ``text`` attribute
        :return: generator of CopyrightAnnotation objects
        """
        # Cheap pre-check: skip sentence splitting when nothing can match.
        if not cls.copyright_ptn_re.search(text):
            return

        name_trim_chars = string.punctuation + string.whitespace

        for sent_start, _, sentence in get_sentence_span_list(text):
            tagged_phrases = cls.extract_phrases_with_coords(sentence)

            for phrase, phrase_start in tagged_phrases:
                # Shift applied to BOTH ends of a match span. Previously only
                # the start included sent_start, so end could precede start.
                # NOTE(review): assumes phrase_start is relative to the
                # sentence (phrases are extracted from `sentence`) - confirm.
                offset = phrase_start + sent_start

                for match in cls.copyright_ptn_re.finditer(phrase):
                    cp_text, cp_sign, cp_date, cp_name = match.groups()

                    # TODO: catch in the general regex
                    if not cp_date:
                        cp_date_at_end = cls.year_ptn_re.search(cp_name)
                        if cp_date_at_end:
                            cp_date = cp_date_at_end.group()
                            # Escape the date so it is matched literally
                            # when trimmed from the end of the name.
                            cp_name = re.sub(
                                r'{}$'.format(re.escape(cp_date)), '',
                                cp_name)

                    start, end = match.span()
                    ant = CopyrightAnnotation(
                        coords=(start + offset, end + offset),
                        sign=cp_sign.strip(),
                        date=cp_date,
                        name=cp_name.strip(name_trim_chars))

                    if return_sources:
                        ant.text = cp_text.strip()
                    cls.split_copyright_date(ant)
                    cls.derive_company_name(ant, phrase)
                    yield ant
Пример #5
0
    def test_format_copyright_annotation(self):
        """Check that get_cite() mentions the company and the year(s)."""
        cp = CopyrightAnnotation(name='Siemens',
                                 coords=(0, 100),
                                 text='text text',
                                 locale='locale')
        cp.company = 'Siemens'
        cp.year_start = 1996

        # Presumably something like '/copyright/Siemens/1996'; only the
        # individual fragments are asserted, not the exact layout.
        cite = cp.get_cite()
        for fragment in ('copyright', 'Siemens', '1996'):
            # assertIn reports both operands on failure, unlike find() > -1.
            self.assertIn(fragment, cite)

        cp.year_end = 2019
        cp.locale = 'en'

        # Presumably something like '/en/copyright/Siemens/1996/2019'.
        cite = cp.get_cite()
        for fragment in ('copyright', 'Siemens', '1996', '2019'):
            self.assertIn(fragment, cite)
Пример #6
0
 def test_copyright_annotation(self):
     """Check the citation of an annotation built from coords and years."""
     annotation = CopyrightAnnotation(
         coords=(2, 20),
         year_start=1998,
         year_end=2001)
     expected = '/en/copyright/1998/2001'
     self.assertEqual(expected, annotation.get_cite())