示例#1
0
    def parse_line(self):
        """Parse input lines one at a time, yielding typed (kind, object) pairs.

        Yields:
            ("paragraph", Paragraph) for the configured paragraph line,
            ("question_answer", QuestionAnswer) for each question line,
            ("possible_answers", PossibleAnswers) for the answers line.

        Raises:
            ValueError: when a line number matches none of the configured ranges.
        """
        line_number = 1
        qline_begin = self.appconfig.get_value('question_lines', 'start')
        qline_end = self.appconfig.get_value('question_lines', 'end')
        possible_answer_line = self.appconfig.get_value(
            'possible_answers_line')

        for line in self.readlines():

            if line_number == self.appconfig.get_value('paragraph_line'):
                para = Paragraph(line, self.appconfig)
                para.parse()
                yield ("paragraph", para)

            elif qline_begin <= line_number <= qline_end:
                # BUG FIX: str.replace returns a new string; the original
                # discarded the result, so '?' was never actually stripped.
                line = line.replace('?', '')
                qa = QuestionAnswer(line)
                yield ("question_answer", qa)

            elif line_number == possible_answer_line:
                pa = PossibleAnswers(line, self.appconfig)
                pa.parse()
                yield ("possible_answers", pa)

            else:
                raise ValueError('Invalid input, number of lines exceeded')

            line_number += 1
示例#2
0
    def test_init(self):
        """Paragraph strips markup fragments and inline HTML tags."""
        # A leading tag remnant is removed from the sentence.
        p = Paragraph('/>あいうえお')
        self.assertEqual(p.sentences, ['あいうえお'])

        # An inline anchor tag is removed, joining the surrounding text.
        p = Paragraph('あいうえお<a href="somewhere">かき</a>くけこ')
        self.assertEqual(p.sentences, ['あいうえおかきくけこ'])
def generate_novel():
    """Generate a short novel: three characters acting across five settings.

    Prints each generated paragraph to stdout.
    """
    characters = [Person(), Person(), Person()]
    # BUG FIX: scene_settings was never initialised before being appended to,
    # raising NameError (or silently depending on an unrelated global).
    scene_settings = []
    for _ in range(5):
        scene_settings.append(generate_setting())
    for setting in scene_settings:
        paragraph = Paragraph(characters, setting)
        paragraph.generate_sentences()
        print(paragraph)
def make_paragraphs(raw_data_path, decorated_data_path, urn, aux_info):
    """Split text into paragraphs, decorate each with aux info, and save them.

    NOTE(review): `text` and `info` are placeholders ("from urn") — presumably
    they should be loaded via `urn` / `raw_data_path`; confirm with the caller.
    """
    text, info = "", ""  # from urn
    aux = decorate(info, aux_info)
    for chunk in text.split('\n'):
        paragraph = PG(chunk, info)
        paragraph.set_aux_info(aux)
        paragraph.save_to_file(decorated_data_path)
示例#5
0
    def save_section_features_to_db(self, paper_ids, list_authors,
                                    list_authors_id_200):
        """Tokenize each paper's sections, chunk them into paragraphs, and
        persist the raw sections plus 57 stylometric features per chunk
        into the `section` and `features` tables.

        Args:
            paper_ids: sequence of novel/paper ids, one per paper.
            list_authors: per-paper list of contributing author ids.
            list_authors_id_200: author ids to track in the usage counter.
        """
        # BUG FIX: the original format string had one placeholder but two
        # arguments (the user was censored to '******'), which raises
        # TypeError at runtime; the second argument implies user='%s'.
        con = psycopg2.connect("dbname='%s' user='%s' host=/tmp/" %
                               (self.db_name.lower(), getpass.getuser()))
        cur = con.cursor()

        # Count of sections contributed so far by each tracked author.
        index = {m: 0 for m in list_authors_id_200}

        num_section = 1
        chunk_id = 1

        for i in range(self.num_paper):
            tokens_sum = []
            for j in range(len(list_authors[i])):
                novel_id = paper_ids[i]
                index[list_authors[i][j]] += 1

                raw_novel_text = self.get_raw_text(novel_id)
                tokens = nltk.word_tokenize(raw_novel_text.decode('utf-8'))
                # BUG FIX: integer division — a float slice bound raises
                # TypeError on Python 3 (same value as Py2 int division).
                tokens_sum += tokens[0:self.token_size // len(list_authors[i])]

                cur.execute("INSERT INTO section VALUES(%s,%s,%s,%s,%s)", [
                    i + 1, num_section, raw_novel_text, novel_id,
                    list_authors[i][j]
                ])
                num_section += 1

            paragraphs = self.get_paragraphs(tokens_sum)

            for x in range(len(paragraphs)):
                para = Paragraph("paper_id", para=paragraphs[x])
                # The original wrapped this in `try/except: raise` followed by
                # an unreachable print — a no-op; failures still propagate.
                stylo_list = para.get_stylo_list()
                for y in range(57):
                    feature_id = y + 1
                    try:
                        value = 0 if math.isnan(
                            stylo_list[y]) else stylo_list[y]
                    except (IndexError, TypeError):
                        # Missing or non-numeric feature defaults to 0.
                        value = 0
                    # BUG FIX: pass values as query parameters instead of
                    # %-interpolating them into the SQL string (injection-prone
                    # and breaks on quotes in values).
                    cur.execute(
                        "INSERT INTO features VALUES (%s, %s, %s, %s)",
                        (i + 1, chunk_id, feature_id, value))
                chunk_id += 1

            con.commit()
            print("saved section no %s" % (i + 1))

        # BUG FIX: close the cursor before its connection, not after.
        cur.close()
        con.close()
示例#6
0
    def draw(self,page,context):
        """Render this static-text element onto the page, honouring the
        element's print condition, declared size, and position."""
        if not self.reportElement.is_printable(context):
            return

        flowable = Paragraph(self.text.data, self.textElement.get_style())
        flowable.wrap(self.reportElement['width'], self.reportElement['height'])
        x, y = page.translate(self.reportElement['x'], self.reportElement['y'])
        flowable.drawOn(page.canvas, x, y)
示例#7
0
    def test_split_by_dots(self):
        """split_by_dots cuts on '!' and '。' but leaves plain text intact."""
        p = Paragraph('a')

        self.assertEqual(p.split_by_dots('abc'), ['abc'])
        self.assertEqual(p.split_by_dots('a!bc'), ['a', 'bc'])
        self.assertEqual(p.split_by_dots('a。bc'), ['a', 'bc'])
示例#8
0
    def test_contains_nomenclature(self):
        """Nomenclatural citations are detected whether the paragraph is
        built line-by-line via append_ahead or directly via append."""
        for text in ('hamster',
                     '≡ Polyporus mori (Pollini) Fr., Systema Mycologicum 1:',
                     '344 (1821)'):
            self.pp.append_ahead(Line(text))
        self.pp.close()
        self.assertTrue(self.pp.contains_nomenclature())

        other = Paragraph()
        other.append(Line('Araneosa columellata Long, Mycologia 33 (1941) 353.'))
        self.assertTrue(other.contains_nomenclature())
示例#9
0
 def setUp(self):
     """Build a two-line paragraph containing a synonymy citation."""
     self.pp = Paragraph()
     citation_lines = (
         'Julella sublactea (Nylander) R.C. Harris in Egan, Bryologist 90: 163. 1987;\n',
         'Verrucaria sublactea Nylander, Flora 69: 464. 1886. syn. nov.\n',
     )
     for text in citation_lines:
         self.pp.append_ahead(Line(text))
     self.pp.close()
示例#10
0
    def test_split_by_dots(self):
        """Sentence splitting: '!' and '。' are delimiters; other text is kept."""
        p = Paragraph('a')
        cases = [
            ('abc', ['abc']),
            ('a!bc', ['a', 'bc']),
            ('a。bc', ['a', 'bc']),
        ]
        for text, expected in cases:
            self.assertEqual(p.split_by_dots(text), expected)
示例#11
0
def load_docs(docs_file_path):
    """Parse an XML corpus file into a list of Document objects.

    Expected layout: <doc> elements containing <p> elements, each holding
    <concept name="..." freq="..."/> children.
    """
    dom = minidom.parse(docs_file_path)
    root = dom.documentElement

    docs = []
    for doc_el in root.getElementsByTagName("doc"):
        paragraphs = []
        for p_el in doc_el.getElementsByTagName("p"):
            concepts = {
                c_el.getAttribute("name"): int(c_el.getAttribute("freq"))
                for c_el in p_el.getElementsByTagName("concept")
            }
            paragraphs.append(Paragraph(concepts))
        docs.append(Document(paragraphs))

    return docs
示例#12
0
    def draw(self,page,context):
        """Render a field element: resolve its expression, format the value,
        and draw it as a paragraph at the element's position."""
        if not self.reportElement.is_printable(context):
            return

        value = self.resolve_expression(self.fieldExpression.data, context)
        if value is None and self['isBlankWhenNull']:
            value = ''
        else:
            value = self.format(value)

        flowable = Paragraph(value, self.textElement.get_style())
        flowable.wrap(self.reportElement['width'], self.reportElement['height'])
        x, y = page.translate(self.reportElement['x'], self.reportElement['y'])
        flowable.drawOn(page.canvas, x, y)
示例#13
0
 def summarize(self, content):
     """Summarize every paragraph in *content*.

     Args:
         content: iterable of paragraph objects exposing index/title.

     Returns:
         list of Paragraph objects holding the summarized text.
     """
     summarized = []
     for paragraph in content:
         # BUG FIX: the original rebound the loop iterable's name `content`
         # inside the body — harmless only because the iterator is captured
         # before the first rebind, but fragile and confusing; use a local.
         summary = self.summarize_paragraph(paragraph)
         summarized.append(Paragraph(paragraph.index, paragraph.title, summary))
     return summarized
示例#14
0
    def write_paragraph(self, section_number, content):
        """Append *content* as a Paragraph titled after its section.

        Section 0 is the summary; every other section takes the title of
        the entry preceding it in the section list.
        """
        title = ("Summary" if section_number == 0
                 else self.get_section(section_number - 1))
        self.paragraphs.append(Paragraph(section_number, title, content))
示例#15
0
    def document(self):
        """Build the object document model from the raw text.

        ALL-UPPERCASE lines become headings; blank lines terminate the
        current paragraph.

        Returns:
            ObjectDocumentModel wrapping the collected Paragraphs.
        """
        current_paragraph = []
        paragraphs = []
        for line in self._text.splitlines():
            line = line.strip()
            if line.isupper():
                heading = Sentence(line, self._tokenizer, is_heading=True)
                current_paragraph.append(heading)
            elif not line and current_paragraph:
                sentences = self._to_sentences(current_paragraph)
                paragraphs.append(Paragraph(sentences))
                current_paragraph = []
            elif line:
                current_paragraph.append(line)

        # EDGE-CASE FIX: only flush the trailing paragraph when it has
        # content; the original appended unconditionally, producing an empty
        # Paragraph for text ending in a blank line (or for empty text).
        if current_paragraph:
            paragraphs.append(Paragraph(self._to_sentences(current_paragraph)))

        return ObjectDocumentModel(paragraphs)
示例#16
0
    def summarize(self, content):
        """Return new Paragraphs, each reduced to its first sentence."""
        result = []
        for paragraph in content:
            first_sentence = self.get_first_sentance(paragraph.content)
            result.append(
                Paragraph(paragraph.index, paragraph.title, first_sentence))
        return result
示例#17
0
    def read_raw_text(self, raw_text_path):
        """Slice the raw article text into paragraphs (split on '\\n\\n') and
        attach each known sentence to the paragraph whose offsets enclose it.

        Note: this fragment is Python 2 code (`print >> logs`).
        """
        # BUG FIX: use a context manager so the file handle is closed;
        # ''.join(f.readlines()) is just f.read().
        with open(raw_text_path) as raw_file:
            characters = raw_file.read()

        begin = 8  # each article begins with ".START\n\n"
        pid = 0
        while True:
            pc = characters.find('\n\n', begin)
            if pc == -1:
                break
            self.paragraphs.append(Paragraph(begin, pc, pid))
            pid += 1
            begin = pc + 2  # skip the '\n\n' separator

        for sen in self.sentences:
            flag = False
            for para in self.paragraphs:
                # A sentence belongs to the paragraph enclosing its offsets.
                if sen.begin_offset >= para.begin_offset and sen.end_offset <= para.end_offset:
                    para.sentences.append(sen)
                    flag = True
                    break
            if not flag:
                print >> logs, 'sentence outof paragraph'
示例#18
0
def parse_annotated(contents: Iterable[Line]) -> Iterator[Paragraph]:
    """Return paragraphs in annotated block form.

    Do not apply heuristic methods to divide paragraphs."""
    pp = Paragraph()
    for line in contents:
        pp.append_ahead(line)

        # A start annotation on this line, or an end label on the previous
        # one, both terminate the paragraph under construction.
        if line.contains_start() or (
                pp.last_line and pp.last_line.end_label() is not None):
            (finished, pp) = pp.next_paragraph()
            yield finished
示例#19
0
def retrieve_forbes_article_data(article, company_list,
                                 company_name_automaton):
    """Scrape a Forbes article's paragraphs and extract company tickers.

    Mutates *article* in place (paragraphs, webpage_text, tickers) and
    returns it; returns None when the article body cannot be located.
    """
    article.paragraphs = []

    browser.get(article.url)

    try:
        article_text_section = browser.find_element_by_xpath(
            '//*[@id="article-container-0"]/div[2]/div[2]/article-body-container/div/div'
        )
        article_paragraph_tags = article_text_section.find_elements_by_tag_name(
            'p')
    except Exception:
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt. No article body => give up.
        return

    all_tickers = []

    for paragraph in article_paragraph_tags:
        # Selenium's .text is a remote call — fetch it once per tag.
        text = paragraph.text

        # Skip image-caption paragraphs.
        if '[+]' in text:
            continue
        tickers = []
        for end_index, idx in company_name_automaton.iter(text.upper()):
            # Strip the first and last characters (quoting) from the
            # stored company string.
            tickers.append(company_list[idx][1:-1])

        all_tickers.extend(tickers)
        article.paragraphs.append(Paragraph(text, tickers))
        article.webpage_text = article.webpage_text + text

    # Deduplicate while preserving first-seen order.
    article.tickers = list(dict.fromkeys(all_tickers))
    return article
示例#20
0
 def reset(self):
     """Reset parser state: fresh paragraph, machine back to 'start'."""
     super().reset()
     self.paragraph = Paragraph()
     self.state = 'start'
示例#21
0
 def __init__(self, paragraphs_actions: List[ParagraphsAction] = None):
     """Create the parser.

     Args:
         paragraphs_actions: reducer callables applied to each finished
             paragraph; defaults to no actions.
     """
     super().__init__()
     # BUG FIX: a mutable default argument ([]) is shared across every
     # instance constructed without the argument; use None as a sentinel
     # and build a fresh list per instance instead.
     self.paragraphs_actions = (
         paragraphs_actions if paragraphs_actions is not None else [])
     self.paragraph = Paragraph()
示例#22
0
class BlogParser(MachineHTMLParser):
    """State-machine HTML parser that extracts blog paragraphs.

    Walks <header> metadata (title, author, date) and then the <article>
    body, pushing the accumulated Paragraph through `paragraphs_actions`
    each time a subtitle or the end of the article is reached.
    """

    paragraphs_actions: List[ParagraphsAction]
    paragraph: Paragraph

    def __init__(self, paragraphs_actions: List[ParagraphsAction] = None):
        super().__init__()
        # BUG FIX: the original used a mutable default argument ([]),
        # shared across every instance constructed without the argument.
        self.paragraphs_actions = (
            paragraphs_actions if paragraphs_actions is not None else [])
        self.paragraph = Paragraph()

    # utilities

    def parse_file(self, filename: str, rel: Optional[str] = None):
        """Parse *filename*, recording it on the paragraph — relative to
        *rel* when given."""
        self.reset()
        if isinstance(rel, str):
            # BUG FIX: Path(rel).relative_to(rel) is always '.', discarding
            # the filename; relativize the actual file path instead.
            self.paragraph.filename = Path(filename).relative_to(rel)
        else:
            self.paragraph.filename = filename
        super().parse_file(filename)

    def parse_date(self, time: str) -> str:
        """Parse 'Monday, Jan 1, 2015'-style dates to ISO format; log and
        return '' on failure."""
        time_ = time.strip()
        fmt = '%A, %b %d, %Y'  # example: Monday, Jan 1, 2015
        try:
            return datetime.strptime(time_, fmt).isoformat()
        except ValueError:
            log.error(f'Invalid date format:"{time_}"')
            log.error(f'Invalid date format @ {self.location()}')
            return ""

    # reduce middleware once paragraph is read

    def push_paragraph(self):
        """Run the finished paragraph through the action pipeline, then
        start a fresh one."""
        reduce(lambda x, f: f(x), self.paragraphs_actions, [self.paragraph])
        self.paragraph = self.paragraph.new_paragraph()

    # state-machine logic

    def reset(self):
        super().reset()
        self.state = 'start'
        self.paragraph = Paragraph()

    def validate_transition(self, to_state: State):
        return (self.state, to_state) in valid_transitions

    def dispatch(self, ms: TransitionData, attrs: Attrs = {}):
        """Advance the state machine for one (state, tag, event) triple.

        NOTE: `attrs` is never read or mutated here, so its mutable
        default is harmless.
        """
        if ms == ('start', 'header', 'starttag'):
            self.transition('metadata')

        elif ms == ('metadata', 'h1', 'starttag'):
            self.transition('title')

        elif ms == ('metadata', 'Author', 'DATA'):
            self.transition('author_1')

        elif ms == ('metadata', 'Date', 'DATA'):
            self.transition('date_1')

        elif ms == ('metadata', 'header', 'endtag'):
            self.transition('article')

        elif ms == ('title', '*', 'DATA'):
            self.paragraph.article_title = ms.tagOrData

        elif ms == ('title', 'h1', 'endtag'):
            self.transition('metadata')

        elif ms == ('author_1', 'p', 'starttag'):
            self.transition('author_2')

        elif ms == ('author_2', '*', 'DATA'):
            self.paragraph.author = ms.tagOrData
            self.transition('metadata')

        elif ms == ('date_1', 'p', 'starttag'):
            self.transition('date_2')

        elif ms == ('date_2', '*', 'DATA'):
            self.paragraph.date = self.parse_date(ms.tagOrData)
            self.transition('metadata')

        elif ms == ('article', 'h[23]', 'starttag'):
            self.push_paragraph()
            self.transition('subtitle')

        elif ms == ('subtitle', 'h[23]', 'endtag'):
            self.transition('article')

        elif ms == ('article', 'article', 'endtag'):
            self.push_paragraph()
            self.transition('done')

        elif ms == ('article', '*', 'DATA'):
            self.paragraph.text += ms.tagOrData

        # we keep <code> and <p> tags for use in chunking text later

        elif ms == ('article', 'p', 'starttag'):
            self.paragraph.text += "<p>"

        elif ms == ('article', 'p', 'endtag'):
            self.paragraph.text += "</p>"

        elif ms == ('article', 'code', 'starttag'):
            self.paragraph.text += '<code>'

        elif ms == ('article', 'code', 'endtag'):
            self.paragraph.text += '</code>'

        elif ms == ('subtitle', '*', 'DATA'):
            self.paragraph.paragraph_title += ms.tagOrData
示例#23
0
    def endElement(self, name):
        """SAX end-of-element hook: flush accumulated character data into
        Paragraph content and update body/section state flags."""
        if name == 'empty-line':
            self.add_empty_line()
        elif self.is_body and name != 'body':
            data = ''.join(self.cur_data)
            if name in ['strong', 'emphasis', 'a', 'style']:
                if not self.cur_attr:
                    # Nested styles are not supported; drop silently.
                    return
                self.cur_attr.insert(1, len(data))
                self.attrs.append(self.cur_attr)
                self.cur_attr = []
                return
            if data and data.strip():
                # Decide the paragraph kind once — the Paragraph construction
                # was quadruplicated in the original, differing only in kind.
                if self.is_title:
                    self.add_empty_line()
                    kind = 'title'
                elif self.is_epigraph and name == 'p':
                    kind = 'epigraph'
                elif self.is_cite and name == 'p':
                    kind = 'cite'
                else:
                    kind = name
                self.content.append(
                    Paragraph(
                        kind, data,
                        attrs=self.attrs,
                        lang=self.lang,
                        id=self.cur_id,
                        byte_index=_parser.CurrentByteIndex,
                    )
                )
                self.prev_paragraph_is_empty = False
                self.attrs = []
                # NOTE(review): this clears `self.id`, but the id passed to
                # Paragraph above is `self.cur_id` — possibly should be
                # `self.cur_id = None`; confirm before changing.
                self.id = None

        if name == 'description':
            self.is_desc = False
        elif name == 'body':
            self.is_body = False
        elif name == 'epigraph':
            self.is_epigraph = False
            self.add_empty_line()
        elif name == 'cite':
            self.is_cite = False
        elif name == 'title':
            self.is_title = False
            self.add_empty_line()
        elif name in ('subtitle', 'image', 'poem'):
            self.add_empty_line()
        elif name == 'lang':
            self.lang = ''.join(self.cur_data).strip()

        # Character data and link state do not carry across elements.
        self.cur_data = []
        self.links = {}
示例#24
0
 def add_empty_line(self):
     """Append an 'empty-line' Paragraph, collapsing consecutive blanks."""
     if self.prev_paragraph_is_empty:
         return
     marker = Paragraph('empty-line', '',
                        byte_index=_parser.CurrentByteIndex)
     self.content.append(marker)
     self.prev_paragraph_is_empty = True
示例#25
0
def writeCoursesWithDescriptions(ws):
    """Extract course titles and their descriptions from the global docx
    `document` and write one row per course to worksheet *ws*.

    NOTE(review): relies on module-level globals (document, number,
    paragraphs2, formats, ...) and uses the strings 'true'/'false' as
    booleans — apparently to match the return value of
    HeaderIsCourseTitle(); confirm before changing.
    """

    global document
    global courses
    global coursesWithDescriptions
    courseDescriptionOn = 'false'
    allparagraphs = []
    pertinentParagraphs = []
    global number
    global default_format
    global description_format

    # writeFile appears unused in this function.
    writeFile = "pertinentParagraphs.csv"
    pHeader = ""
    headerIsCourseTitle = 'false'

    # Pass 1: wrap every non-empty docx paragraph in our Paragraph objects,
    # stripping trailing page-number digits from the text.
    for p in document.paragraphs:
        if p.text.strip() != "":
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)

    # Pass 2: a body-text paragraph preceded by a 'Normal'-styled paragraph
    # whose text is a course title is considered pertinent.
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            pPrev = allparagraphs[i - 1]
            pHeader = ""
            if pPrev.getStyle() == constant.STYLE_NORMAL:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                # NOTE(review): paragraphs2 is a module-level list never
                # declared `global` here — append works because only the
                # object is mutated; confirm it is initialised elsewhere.
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
                if pHeader != "":
                    headerIsCourseTitle = HeaderIsCourseTitle(pHeader)

            # NOTE(review): headerIsCourseTitle keeps its previous value when
            # the preceding paragraph is not 'Normal' — possibly intentional.
            if headerIsCourseTitle == 'true':
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)

    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1

    # Worksheet header row.
    row = 0
    col = 0
    ws.write(row, col, 'Knowledge Area')
    col += 1
    ws.write(row, col, 'Course Title')
    col += 1
    ws.write(row, col, 'Description')
    col = 0
    row += 1

    # Pass 3: accumulate a description until the next course header appears,
    # then flush the finished course to the worksheet.
    for i, p in enumerate(pertinentParagraphs):
        courseTitleCandidate = p.getHeader().strip()
        if p.getHeader().strip(
        ) != "" and courseDescriptionOn == 'true' and i > courseTitlePosition:
            # A new header while a description is open: flush previous course.
            newCourse = Course()
            newCourse.setKnowledgeArea(associatedCourse.getKnowledgeArea())
            newCourse.setTitle(associatedCourse.getTitle())
            newCourse.setDescription(currentCourseDescription)
            coursesWithDescriptions.append(newCourse)
            courseDescriptionOn = 'false'
            ws.write(row, col, associatedCourse.getKnowledgeArea())
            col += 1
            ws.write(row, col, associatedCourse.getTitle())
            col += 1
            ws.write(row, col, currentCourseDescription)
            col = 0
            row += 1

            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and courseDescriptionOn == 'false':
            # Start collecting a new course's description.
            courseDescriptionOn = 'true'
            associatedCourse = GetAssociatedCourse(p.getHeader().strip())
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn == 'true' and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()

    # Column widths/formats for the three output columns.
    ws.set_column(0, 0, 35, default_format)
    ws.set_column(1, 1, 50, default_format)
    ws.set_column(2, 2, 120, description_format)
示例#26
0
def writeDocStyles():
    """Dump docx paragraph styles to two CSVs: a condensed file
    (docStyles_fall.csv) and a raw per-run file (docStylesRaw_fall.csv).

    NOTE(review): depends on module-level globals `document`, `number`,
    `paragraphs1`, and `paragraphs2`; the last two are assumed to be
    pre-initialised lists.
    """
    global number
    writeFile = "docStyles_fall.csv"
    wrawFile = "docStylesRaw_fall.csv"

    global document

    with open(writeFile, 'w') as output:
        fieldnames = ['number', 'style', 'text']
        writer = csv.DictWriter(output, fieldnames=fieldnames)

        writer.writeheader()
        # Collect every non-empty paragraph with its runs and formatting.
        for p in document.paragraphs:
            if p.text.strip() != "":
                number = number + 1
                paragraph = Paragraph()
                runCnt = 0
                for r in p.runs:
                    runCnt += 1
                    run = Run()
                    run.setBold(str(r.bold))
                    run.setFont(r.font.name)
                    run.setItalic(str(r.font.italic))
                    paragraph.setRuns(run)

                paragraph.setNumber(number)
                paragraph.setStyle(p.style.name)
                # Strip trailing page-number digits from the text.
                paragraph.setText(p.text.strip().rstrip(string.digits))
                paragraph.setRunCount(runCnt)
                paragraphs1.append(paragraph)

        number = 0
        i = 0

        # Body-text paragraphs (with any preceding 'Normal'-styled header)
        # form the condensed output.
        for i, item in enumerate(paragraphs1):
            if item.getStyle() == constant.STYLE_BODY_TEXT:
                pPrev = paragraphs1[i - 1]
                if pPrev.getStyle() == constant.STYLE_NORMAL:
                    number = number + 1
                    p2 = Paragraph()
                    p2.setNumber(number)
                    p2.setStyle(pPrev.getStyle())
                    p2.setText(pPrev.getText())
                    paragraphs2.append(p2)
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setText(item.getText().strip())
                paragraphs2.append(pc)

        for paragraph in paragraphs2:
            writer.writerow({
                'number': paragraph.getNumber(),
                'style': paragraph.getStyle(),
                'text': paragraph.getText()
            })

    # Raw dump: one row per (paragraph, run) pair.
    with open(wrawFile, "w") as rawoutput:
        fieldnames = ['number', 'style', 'runs', 'font', 'bold', 'text']

        rwriter = csv.DictWriter(rawoutput, fieldnames=fieldnames)

        rwriter.writeheader()

        for paragraph in paragraphs1:
            for run in paragraph.getRuns():
                rwriter.writerow({
                    'number': paragraph.getNumber(),
                    'style': paragraph.getStyle(),
                    'runs': paragraph.getRunCount(),
                    'font': run.getFont(),
                    'bold': run.getBold(),
                    'text': paragraph.getText()
                })
示例#27
0
def parse_paragraphs(contents: Iterable[Line]) -> Iterator[Paragraph]:
    """Group a stream of Lines into Paragraphs using layout heuristics.

    Each heuristic below either keeps accumulating lines into `pp` or
    closes the current paragraph (yielding it when non-empty) and begins
    the next one via next_paragraph(). Heuristic order matters: earlier
    checks take precedence over later ones.
    """
    pp = Paragraph()
    for line in contents:
        pp.append_ahead(line)

        # A nomenclature line splits the current paragraph in two.
        next_pp = pp.split_at_nomenclature()
        if next_pp:
            if not pp.is_empty():
                yield pp
            (retval, pp) = next_pp.next_paragraph()
            yield retval
            continue

        # New document triggers a new paragraph.
        if pp.last_line and pp.last_line.filename != line.filename:
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Page break triggers a new paragraph.
        # NOTE(review): startswith('') is always True for plain strings,
        # which would make every later heuristic unreachable — a form-feed
        # ('\f') marker was probably lost in transcoding; confirm the
        # semantics of Line.startswith before changing.
        if line.startswith(''):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Page break is a whole paragraph.
        if pp.is_page_header():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Leading tab triggers a new paragraph.
        if line.startswith('\t'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Tables start with a few long lines and
        # continue to grow as long as we have short lines.
        if pp.is_table():
            if line.is_short(pp.short_line):
                continue
            else:
                if pp.is_all_long():
                    continue
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Blocks of blank lines are a paragraph.
        if pp.is_blank():
            if line.is_blank():
                continue
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Figures end with a blank line, or period or colon at the end
        # of a line.
        if pp.is_figure():
            if (not line.is_blank() and
                not pp.detect_period() and
                not pp.endswith(':')):
                continue
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Leading hyphen triggers a new paragraph.
        if line.startswith('-'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A table starts a new paragraph.
        if pp.next_line.is_table():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Synonymy reference ends a taxon.
        if pp.last_line and pp.last_line.search(r'\([Ss]yn.*\)$'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A taxon ends in nov., nov. comb., nov. sp., ined.,
        # emend. (followed by emender), or nom. sanct.
        if pp.last_line and pp.last_line.search(
                r'(nov\.|nov\.\s?(comb\.|sp\.)|[(]?in\.?\s?ed\.[)]?|'
                r'[(]?nom\.\s?sanct\.[)]?|emend\..*)$'
        ):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A short line ends a paragraph.
        if pp.last_line and pp.last_line.is_short(pp.short_line):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A blank line ends a paragraph.
        if line.is_blank():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue
示例#28
0
def main():
    """Command-line entry point for training/evaluating paragraph classifiers.

    Pipeline (each phase can be dumped with --dump-phase):
      phase 1: parse input files into Paragraph objects
      phase 2: optionally drop interstitial paragraphs
      phase 3: collapse labels to the requested target classes
      phase 4: predicted labels for the evaluation files
    """
    args = define_args()

    Paragraph.set_reinterpretations(args.reinterpret)

    if args.dump_files:
        print('\ntraining_files:', args.training_files)
        print('\nevaluate_files:', args.evaluate_files)

    # Candidate models; --classifier / --vectorizer select one by class name.
    classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]
    vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]

    # Reduced sets used with --fast; commented entries are too slow or too
    # memory-hungry for quick runs.
    fast_classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        # BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        # RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        # OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        # KNeighborsClassifier()  # Actually not slow, but we run out of memory.
    ]
    fast_vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        # HashingVectorizer()
    ]

    if args.fast:
        classifiers = fast_classifiers
        vectorizers = fast_vectorizers

    try:
        i = [c.__class__.__name__ for c in classifiers].index(args.classifier)
    except ValueError:
        raise ValueError('Unknown classifier %s' % args.classifier)
    classifier = classifiers[i]

    try:
        i = [v.__class__.__name__ for v in vectorizers].index(args.vectorizer)
    except ValueError:
        raise ValueError('Unknown vectorizer %s' % args.vectorizer)
    vectorizer = vectorizers[i]

    if args.training_files:
        contents = read_files(args.training_files)

        if args.annotated_paragraphs:
            phase1 = parse_annotated(contents)
        else:
            phase1 = parse_paragraphs(contents)

        if 1 in args.dump_phase:
            print('Phase 1')
            print('=======')
            phase1 = list(phase1)
            print(repr(phase1))
            if 1 == max(args.dump_phase):
                sys.exit(0)

        if args.keep_interstitials:
            phase2 = phase1
        else:
            phase2 = remove_interstitials(phase1)
        phase1 = None  # Potentially recover memory.

        if 2 in args.dump_phase:
            print('Phase 2')
            print('=======')
            phase2 = list(phase2)
            print(repr(phase2))
            if 2 == max(args.dump_phase):
                sys.exit(0)

        # All labels need to be resolved for this phase. The easiest way
        # to assure this is to convert to list.
        phase3 = target_classes(
            list(phase2),
            default=Label('Misc-exposition'),
            keep=[Label(l) for l in args.labels]
        )

        if args.dump_input:
            phase3 = list(phase3)
            if args.output_annotated:
                if not args.output_labels:
                    print('\n'.join([pp.as_annotated() for pp in phase3]))
                else:
                    print('\n'.join([pp.as_annotated()
                                     for pp in phase3
                                     if pp.top_label() in args.output_labels]))
            else:
                print('\n'.join([str(pp) for pp in phase3]))

        phase2 = None  # Potentially recover memory.

        if 3 in args.dump_phase:
            print('Phase 3')
            print('=======')
            phase3 = list(phase3)
            print(repr(phase3))
            if 3 == max(args.dump_phase):
                sys.exit(0)

        phase3 = list(phase3)
        sample_size = len(phase3)

        if args.group_paragraphs:
            writer = csv.DictWriter(sys.stdout, fieldnames=Taxon.FIELDNAMES)
            writer.writeheader()
            for taxon in group_paragraphs(phase3):
                for d in taxon.dictionaries():
                    writer.writerow(d)
            sys.exit(0)

        # Deterministic 70/30 train/test split (fixed SEED).
        np.random.seed(SEED)
        cutoff = int(sample_size * 0.70)
        permutation = np.random.permutation(phase3)
        phase3 = None
        learn = paragraph.to_dataframe(permutation[:cutoff], args.suppress_text)
        test = paragraph.to_dataframe(permutation[cutoff:], args.suppress_text)

        if args.test_classifiers:
            perform(
                classifiers,
                vectorizers,
                learn,
                test
            )
            sys.exit(0)

        if args.test_classifiers_by_label:
            perform_confusion_matrix(
                classifiers,
                vectorizers,
                learn,
                test,
                emit_csv=args.csv
            )
            sys.exit(0)

    # Train or load models.
    if args.load_vectorizer:
        vectorizer = joblib.load(args.load_vectorizer)
        classifier = joblib.load(args.load_classifier)
    else:
        # NOTE(review): 'learn' only exists when training files were given;
        # without --load-vectorizer and without --training-files this line
        # raises NameError — confirm the CLI enforces one of the two.
        vectorize_text = vectorizer.fit_transform(learn.v2)
        classifier.fit(vectorize_text, learn.v1)

    # Dump trained models.
    if args.dump_vectorizer:
        joblib.dump(vectorizer, args.dump_vectorizer)
    if args.dump_classifier:
        joblib.dump(classifier, args.dump_classifier)

    # Fix: initialize phase4 unconditionally so the phase-4 dump below does
    # not raise NameError when no evaluation files were supplied.
    phase4 = []

    if args.evaluate_files:
        # Parse the evaluation files and predict a label per paragraph.
        if args.keep_interstitials:
            evaluated = (
                parse_paragraphs(read_files(args.evaluate_files)))
        else:
            evaluated = remove_interstitials(
                parse_paragraphs(read_files(args.evaluate_files)))
        for pp in evaluated:
            text = str(pp)
            vectorize_text = vectorizer.transform([text])
            predict = classifier.predict(vectorize_text)[0]
            # Rule-based override: nomenclature is detected directly.
            if args.insert_nomenclature and pp.contains_nomenclature():
                predict = 'Nomenclature'
            phase4.append(pp.replace_labels(labels=[Label(predict)]))

        if args.output_annotated:
            if not args.output_labels:
                print('\n'.join([pp.as_annotated() for pp in phase4]))
            else:
                print('\n'.join([pp.as_annotated()
                                 for pp in phase4
                                 if pp.top_label() in args.output_labels]))

    if 4 in args.dump_phase:
        print('Phase 4')
        print('=======')
        print(repr(phase4))
        if 4 == max(args.dump_phase):
            sys.exit(0)
示例#29
0
def ExtractCourseDescriptions(document, courses):
    """Extract course descriptions from *document* and attach each one to
    the matching Course record in *courses*.

    A description is the run of Body Text paragraphs that follows a
    Normal-style paragraph whose text looks like a course title.

    :param document: a python-docx Document object (iterated via .paragraphs)
    :param courses: the collection of Course objects to update
    """

    courseDescriptionOn = False
    allparagraphs = []
    # Fix: paragraphs2 was appended to below without ever being defined,
    # which raised NameError on the first Body Text paragraph.
    paragraphs2 = []
    pertinentParagraphs = []
    number = 0
    pHeader = ""
    headerIsCourseTitle = False

    # Pass 1: wrap every non-blank paragraph (trailing digits stripped,
    # e.g. page numbers) in a local Paragraph record.
    # (The original also assembled a 'fulltitle' from colon-terminated
    # lines here, but the result was never used; that dead code is removed.)
    for p in document.paragraphs:
        if p.text.strip() != "":
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)

    # Pass 2: keep Body Text paragraphs whose preceding Normal paragraph
    # is a course title; remember that title as the paragraph's header.
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            # NOTE(review): for i == 0 this wraps around to the last
            # paragraph; assumed body text never appears first — confirm.
            pPrev = allparagraphs[i - 1]
            pHeader = ""
            if pPrev.getStyle() == constant.STYLE_NORMAL:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
                if pHeader != "":
                    headerIsCourseTitle = HeaderIsCourseTitle(pHeader)

            if headerIsCourseTitle:
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)

    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1

    # Pass 3: accumulate description text per course; a new header flushes
    # the accumulated text to the previous course.
    for i, p in enumerate(pertinentParagraphs):
        if p.getHeader().strip(
        ) != "" and courseDescriptionOn and i > courseTitlePosition:
            associatedCourse.setDescription(currentCourseDescription)
            courseDescriptionOn = False
            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and not courseDescriptionOn:
            courseDescriptionOn = True
            associatedCourse = GetAssociatedCourse(courses,
                                                   p.getHeader().strip())
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()

    # Fix: flush the description of the final course; the original only
    # saved a description when a subsequent header appeared, so the last
    # course's text was silently dropped.
    if courseDescriptionOn:
        associatedCourse.setDescription(currentCourseDescription)
示例#30
0
        img = PIL.Image.open(join(img_dir, img_list[page]))
        print('image loaded')
        img: Image
        width = img.width
        name_font = load_font(
            join(
                current_dir,
                try_get(settings, 'name_font', 'fonts/FZY3JW.TTF',
                        quiet=True)),
            int(get_best_font_size(width) * name_scale_factor))
        word_font = load_font(
            join(
                current_dir,
                try_get(settings, 'word_font', 'fonts/FZY3JW.TTF',
                        quiet=True)), int(get_best_font_size(width)))
        paragraph = Paragraph(width - border_width * 2)
        for i in range(length):
            name = names[i]
            word = words[i]

            if name != '':
                name_color = try_get(name_color_dict,
                                     name,
                                     default_text_color,
                                     quiet=True)
                name_block = TextBlock(name + ': ', name_font, name_color)
                paragraph.add_text_block(name_block)
            else:
                name_color = default_text_color

            word_block = TextBlock(word, word_font, name_color)
示例#31
0
def ExtractCourseDescriptions(document: Document, courses: List[Course]) -> None:
    """
        Extract the course descriptions from the document.
        Write them to the appropriate Course records in the
        database.

        A description is the run of Body Text paragraphs that follows a
        course-name-styled paragraph recognized as a course title.

        :param document: a Python-Docx Document object
        :param courses: a List of Course objects

    """

    courseDescriptionOn = False
    allparagraphs = []
    # Fix: paragraphs2 was appended to below without ever being defined,
    # which raised NameError on the first Body Text paragraph.
    paragraphs2 = []
    pertinentParagraphs = []
    number = 0
    pHeader = ""
    hIsCourseTitle = False

    global courseNameStyles

    # Pass 1: wrap every non-blank paragraph (trailing digits stripped,
    # e.g. page numbers) in a local Paragraph record.
    # (The original also accumulated colon-terminated lines into a
    # 'partialTitle' list here, but the list was never read; that dead
    # code is removed.)
    for p in document.paragraphs:
        if p.text.strip() != "":
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)

    # Pass 2: keep Body Text paragraphs whose preceding course-name-styled
    # paragraph is a course title; remember that title as the header.
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            # NOTE(review): for i == 0 this wraps around to the last
            # paragraph; assumed body text never appears first — confirm.
            pPrev = allparagraphs[i-1]

            pHeader = ""
            if pPrev.getStyle() in courseNameStyles:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
                if pHeader != "":
                    hIsCourseTitle = HeaderIsCourseTitle(pHeader)

            if hIsCourseTitle:
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)

    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1

    # Pass 3: accumulate description text per course; a new header flushes
    # the accumulated text to the previous course.
    for i, p in enumerate(pertinentParagraphs):
        if p.getHeader().strip() != "" and courseDescriptionOn and i > courseTitlePosition:
            associatedCourse.setDescription(currentCourseDescription)
            courseDescriptionOn = False
            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and not courseDescriptionOn:
            courseDescriptionOn = True
            # Apostrophes are stripped so titles match the stored names.
            associatedCourse = GetAssociatedCourse(courses, p.getHeader().replace("'","").strip())
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()

    # Fix: flush the description of the final course; the original only
    # saved a description when a subsequent header appeared, so the last
    # course's text was silently dropped.
    if courseDescriptionOn:
        associatedCourse.setDescription(currentCourseDescription)
示例#32
0
if __name__ == '__main__':
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    documents = []
    vocabulary = set()
    files = os.listdir('corpus')
    for i, file in enumerate(files):
        with open('corpus/' + file, encoding="utf8", errors='ignore') as f:
            raw = f.read()
            paras = paragraph_tokenizer(raw)
            paragraphs = []
            for j, para in enumerate(paras):
                # Preprocessing
                tokens = preprocessor(para)
                _id = (i, j)
                paragraph = Paragraph(_id, tokens)
                paragraphs.append(paragraph)
                for term in tokens:
                    vocabulary.add(term)
            document = Document(i, paragraphs)
            documents.append(document)

    # Length of vocabulary
    vocabularyLength = len(vocabulary)
    # print(vocabularyLength)
    
    # Creating the inverted index
    indexer = Indexer(documents)

    # Take filename as input for processing
    inputDocument = str(sys.argv[1])