Example #1
    def doc_nc_lines(self, bill_number):
        lines = []
        query_string = """  SELECT `Art_Cod_Logico`, `DFA_Cantidad`, `DFA_PV_SinImp`, `DFA_Descuento`, `DFA_Monto_IV`,
        `DFA_Porc_IV`, `DFA_Precio_Venta` FROM DET_NDFact WHERE `NENC_ConsLogico` = ? """

        self.current_connection = AccessConnection()
        if self.current_connection.status:
            query_output, result = self.current_connection.run_query(
                query_string, (str(bill_number), ))
            if result:
                for counter, row in enumerate(query_output):
                    current_article = Article(row[0])
                    article_dictionary = current_article.get_article_data()
                    lines.append({
                        'numero_linea': counter + 1,
                        'codigo': row[0],
                        'cantidad': row[1],
                        'detalle': article_dictionary.get("description"),
                        'precio': row[2],
                        'descuento': row[3],
                        'impuesto': row[4],
                        'porcentaje_impuesto': row[5],
                        'total': row[6],
                    })

        return lines
Example #2
    def writeArticlesToFile(self):
        """ 
        Extracts and transforms each article from BeautifulSoup to an Article obj, then writes the article text to file
        """
        success = 0
        errors = 0
        
        base = "https://www.thedailystar.net/"

        for article in self.articles_raw:
            try:
                article_title = article.h5.text
                article_url = base + article.a['href']
                response = scraper.makeSpoofedRequest(article_url)
                article_soup = BeautifulSoup(response.text, 'html.parser')
                articleObj = Article(self.section_title, article_title, article_soup, self.date)
                articleObj.writeToFile()
                success += 1
            except Exception as e:
                # article_title may be unbound if the title lookup itself failed
                print(f"Error writing article: {e}")
                errors += 1
                continue
        
        print(f"Wrote {success} articles to file in {self.section_title}")
        print(f"Could not write {errors} articles.")
        
        self._log(success, errors)
Example #3
def GetReferenceList(seedArticle, databaseFile = None, graphFile = None):
    global global_identification_value
    seedArticle.link = seedArticle.link.replace('articleDetails', 'abstractReferences')
    html = GetHTMLFromLink(driver, seedArticle.link)
    references = GetReferencesFromHTML(html)
    articleList = []
    
    for ref in references:
        try:
            article = Article()
            article.title = GetTitleFromRef(ref)
            html = GetHTMLSearchIEEEByName(webdriver.Firefox(), article.title)
            article.link = GetSearchLinkFromArticleName(html, article.title)
            article.identification = parseIdentificationFromLink(article.link)
            
            
            articleList.append(article)
#            if (databaseFile is not None and graphFile is not None):
#                AppendDatabaseFromMap([article], databaseFile, graphFile)
            print article.identification
            print article.title
            print "\n"
        except:
            continue

    return articleList
Example #4
 def read_articles(fn='Text/nhk_easy.txt', if_article=True, if_para=True, if_sentence=True):
     f = open(fn)
     articles = {}
     line_match = re.compile(r'(k\d{14})\s{4}(.*)\n')
     for line in f:
         match = line_match.match(line)
         if match:
             news_id = match.group(1)
             text = match.group(2)
             if if_article:
                 articles[news_id] = Article(news_id, text)
             if not if_para:
                 continue
             paras = re.split(' ',text)
             for pid in xrange(len(paras)):
                 news_para_id = news_id + '_para' + str(pid + 1)
                 if len(paras[pid].strip()) > 0:
                     articles[news_para_id] = Article(news_para_id, paras[pid].strip())
                     #print news_para_id, paras[pid]
                     if not if_sentence:
                         continue
                     sentences = re.split('。', paras[pid].strip())
                     for sid in xrange(len(sentences)):
                         news_para_sentence_id = news_para_id + '_s' + str(sid + 1)
                         if (len(sentences[sid].strip())) > 0:
                             articles[news_para_sentence_id] = Article(news_para_sentence_id, sentences[sid].strip() + '。')
                             #print news_para_sentence_id, sentences[sid].strip()
     return articles
Example #5
def update_index(db):
    print('updating main page...')
    payload = {'country': 'US', 'apiKey': 'eb4ad8625c5b4f57bb62f8c95601038a'}
    r = requests.get('https://newsapi.org/v2/top-headlines', params=payload)
    raw_json = r.json()
    index_articles = db["articles"]
    index_articles.delete_many({'is_index': 1})
    for item in raw_json['articles']:
        try:
            article = Article(item['url'])
            article.build()
            index_articles.insert_one({
                'source': article.source_url,
                'title': article.title,
                'url': article.url,
                'topImage': article.topImage,
                'text': article.text,
                'keywords': article.keywords,
                'tags': article.tags,
                'category': article.category,
                'time': article.time,
                'is_index': 1
            })
        except:
            print('pass this article.')
    print('update finished!')
Example #6
def preprocess_article(article: Article):
    ner_types = ["PERSON", "NORP", "ORG", "GPE", "LOC"]

    title = article.title
    title = re.sub("(?i)COVID-19", "coronavirus", title)
    title = re.sub("(?i)COVID19", "coronavirus", title)
    title = re.sub("(?i)COVID", "coronavirus", title)
    title = title.split()

    article.title_clean = " ".join([w for w in title if w not in STOP_WORDS])
    article.nlp_title_clean = nlp(article.title_clean)
    article.title_clean_lemmatized = lemmatizer(article.nlp_title_clean)

    text = article.text
    text = re.sub("(?i)COVID-19", "coronavirus", text)
    text = re.sub("(?i)COVID19", "coronavirus", text)
    text = re.sub("(?i)COVID", "coronavirus", text)
    text = text.split()

    article.text_clean = " ".join([w for w in text if w not in STOP_WORDS])
    # article.nlp_text_clean = nlp(article.text_clean)
    # article.text_clean_lemmatized = lemmatizer(article.nlp_text_clean)
    # article.title_text_named_entities = [ent.text for ent in article.nlp_title_clean.ents if ent.label_ in ner_types] + [ent.text for ent in article.nlp_text_clean.ents if ent.label_ in ner_types]

    return article
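preprocess_article relies on module-level helpers that are not shown. A minimal sketch of what that setup could look like — spaCy is an inference from the nlp/STOP_WORDS names and the PERSON/NORP/ORG/GPE/LOC labels, and this lemmatizer is a hypothetical stand-in for whatever the original module defines:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")      # assumption: any English pipeline would do

def lemmatizer(doc):
    # hypothetical stand-in: reduce a spaCy Doc to its token lemmas
    return [token.lemma_ for token in doc]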
Example #7
def LoadMapFromDatabase(databaseFileName, graphFileName):
    articleMap = {}
    #load database row by row, creating Article objects into the map
    with open(databaseFileName, 'r') as database:
        databasereader = csv.reader(database, delimiter=',',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in databasereader:
            if row is None or len(row)<3 or len(row[0])==0:
                continue
            article = Article()
            article.identification = int(row[0])
            article.title = row[1]
            article.link = row[2]
            articleMap[article.identification] = article

    #load references
    with open(graphFileName, 'r') as graph:
        graphreader = csv.reader(graph, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in graphreader:
            if len(row) < 2:
                continue
            articleMap[int(row[0])].references = []
            for x in row[1:]:
                if len(x)>0 and int(x) in articleMap.keys():
                    articleMap[int(row[0])].references.append(articleMap[int(x)])
    #return map
    return articleMap
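The CSV layout this loader implies, read off the code above; the sample rows below are illustrative, not taken from any real data, and the file names are the ones Example #30 passes in:

# database.csv -- one article per row:  identification,title,link   (fields containing commas are wrapped in '|')
# 1,|An Example Article Title|,https://example.org/document/1
#
# graph.csv -- one row per citing article:  identification,ref_id,ref_id,...
# 1,2,3
articleMap = LoadMapFromDatabase("database.csv", "graph.csv")
seed = articleMap[1]                                   # hypothetical id
cited = [ref.title for ref in getattr(seed, "references", [])]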
Example #8
    def setUp(self):
        """called before the first test case of this unit begins"""
        self.article = Article(
            'www.cnn.com/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'
        )

        self.invalid_article = Article('i am an invalid string')
Example #9
    def addPosting(self,file='',REQUEST=None,RESPONSE=None, index=1):
        """ add an article """
        
        id=self.createId()     
     
        msg=Article(id)
        err, sage = msg.__of__(self)._validation(REQUEST,RESPONSE,'delete attachment',file)
        if err:
            return err

        # Set thread number. 
        msg.tnum = '1'

        self.ids.insert(id)     
        self.data[id]=msg

        if index:
            msg.__of__(self).index()

        if RESPONSE:
            return self.showMessage(self, REQUEST=REQUEST, 
                                title='Article Posted',     
                                message  ='Your article has been posted',
                                action=self.absolute_url()
                                )

        return id
Example #10
 def parsingNews(self, url):
     ret = []
     links = self.parseLinkArtcle(url)
     for v in links:
         (title, text) = self.news.crawling(v)
         art = Article(self.getStringFilter(title), self.getStringFilter(text), "한국경제")
         print("title : ", title)
         print("text : ", text)
         ret.append(art.toDic())
     return ret
Example #11
    def get_article(self, title, url):
        ''' Return an Article object after examining the article for company information '''

        try:
            company_url, company_name = self.parse_article(url)
            article = Article(title, url, company_name, company_url)
        except (AttributeError, TypeError) as e:
            article = Article(title, url)

        return article
Example #12
def seg_to_article(segment):
    pattern1 = re.compile(r"<DOCNO>(.*?)</DOCNO>")
    doc_id = ''.join(pattern1.findall(segment))
    pattern2 = re.compile(r'<TEXT>(.*?)</TEXT>', re.M | re.S)
    doc_text = ''.join(pattern2.findall(segment))
    art = Article(doc_id, doc_text)

    if doc_id == '' or doc_text == '':
        print art.to_string()
        print segment

    return art
Example #13
 def parsingNews(self, url):
     ret = []
     links = self.parseLinkArtcle(self.url1)
     for v in links:
         try:
             (title, text) = self.news.crawling(v)
             print('title : ', title)
             print("text : ", text)
             art = Article(self.util.getStringFilter(title),
                           self.util.getStringFilter(text), "매일경제")
             ret.append(art.toDic())
         except:
             print('new crawling error ')
     return ret
Example #14
def convert_text_to_articles(fn='Text/nhk_easy.txt',
                             if_article=True,
                             if_para=True,
                             if_sentence=True):

    old_articles = read_articles()

    f = open(fn)
    articles = {}
    line_match = re.compile(r'(k\d{14})\s{4}(.*)\n')
    for line in f:
        match = line_match.match(line)
        if match:
            news_id = match.group(1)
            text = match.group(2)
            if if_article:
                articles[news_id] = Article(news_id, text)
            if not if_para:
                continue
            paras = re.split(' ', text)
            for pid in xrange(1, len(paras)):
                news_para_id = news_id + '_para' + str(pid)
                if len(paras[pid].strip()) > 0:
                    articles[news_para_id] = Article(news_para_id,
                                                     paras[pid].strip())
                    # print news_para_id, paras[pid]
                    if not if_sentence:
                        continue
                    sentences = re.split('。', paras[pid].strip())
                    for sid in xrange(len(sentences)):
                        news_para_sentence_id = news_para_id + '_s' + str(sid +
                                                                          1)
                        if (len(sentences[sid].strip())) > 0:
                            articles[news_para_sentence_id] = Article(
                                news_para_sentence_id,
                                sentences[sid].strip() + '。')
                            # print news_para_sentence_id, sentences[sid].strip()

    ##############################################
    # Keep old_articles, combine them into new one
    for doc_id in old_articles.keys():
        if not articles.has_key(doc_id):
            articles[doc_id] = old_articles[doc_id]
    ##############################################

    f = codecs.open('Text/nhk_easy_articles.txt', 'w', 'utf-8')

    for article in articles.values():
        f.write(json.dumps(article.__dict__) + '\n')
    f.close()
Example #15
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        self.test_url()
        self.test_source_url()
        self.test_download_html()
        self.test_parse_html()

    def setUp(self):
        """called before the first test case of this unit begins"""
        self.article = Article(
            'www.cnn.com/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'
        )

    def tearDown(self):
        """called after all test cases finish of this unit"""
        pass

    def test_url(self):
        assert self.article.url == '/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'

    def test_source_url(self):
        assert self.article.source_url == 'http://www.cnn.com'
        request = requests.get(self.article.source_url + self.article.url)
        assert request.status_code == 200

    def test_download_html(self):
        self.article.download()

        assert len(self.article.html) > 5000

    def test_parse_html(self):
        """check whether parser function can use GooseObj correctly"""
        TOP_IMG = 'https://cdn.cnn.com/cnnnext/dam/assets/180925092633-03-iyw-wisniewski-trnds-large-169.jpg'
        TITLE = "4-year-old Florence didn't like sharing her name with a bad hurricane. So she did something about it."
        KEYWORDS = [
            'health',
            "4-year-old Florence didn't like sharing her name with a bad hurricane. So she did something about it. - CNN"
        ]
        AUTHOR = ['Christopher Dawson, CNN']
        self.article.download()
        self.article.parse()
        assert self.article.top_image == TOP_IMG
        assert self.article.title == TITLE
        assert self.article.keywords == KEYWORDS
        # assert self.article.author == AUTHOR

    def test_time(self):
        self.article.download()
        self.article.parse()
        assert self.article.time == 2.5
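Example #15 aggregates its checks in runTest(); a minimal way to run that aggregate with the standard library (a sketch, not part of the original module — instantiating the case with no arguments uses the default methodName 'runTest'):

import unittest

runner = unittest.TextTestRunner(verbosity=2)
runner.run(ArticleTestCase())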
Example #16
    def parsingNews(self, url):
        ret = []
        links = self.parseLinkArtcle(url)
        for v in links:
            (title, text) = self.news.crawling(v)
            text = text.replace("저작권자 © 시사경제신문 무단전재 및 재배포 금지", "")
            art = Article(self.getStringFilter(title),
                          self.getStringFilter(text), "시사경제")
            print("title : ", title)
            print("text : ", text)

            ret.append(art.toDic())
            print(art.toDic())
        return ret
Example #17
 def parse_xml(self, context):
     if 'Journal' in context: #to check if url contains an XML with article
         abstract = context[context.index("<Abstract>")+10:context.index("</Abstract>")]
         title = context[context.index("<ArticleTitle>")+14:context.index("</ArticleTitle>")]
         return Article(abstract, title)
     else:
         return None
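The slice-based extraction above breaks if the tags carry attributes or appear more than once. A sketch of the same extraction with the standard-library parser, written as a standalone function under the assumption that context is a well-formed XML string (an alternative, not the author's code):

import xml.etree.ElementTree as ET

def parse_xml_et(context):
    root = ET.fromstring(context)
    abstract_el = root.find('.//Abstract')
    title_el = root.find('.//ArticleTitle')
    if abstract_el is None or title_el is None:
        return None
    # itertext() gathers the text of nested elements such as <AbstractText>
    abstract = ''.join(abstract_el.itertext()).strip()
    title = ''.join(title_el.itertext()).strip()
    return Article(abstract, title)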
Example #18
def parse_article(url):
    # given a url, get page content
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    data = urlopen(req).read()
    # parse as html structured document
    bs = BeautifulSoup(data,'html.parser')
    # kill javascript content
    # print (bs.prettify())
    for s in bs.findAll('script'):
        s.replaceWith('')

    new_article = Article(bs)
    new_article.add_author()
    new_article.add_title()

    return new_article
Example #19
    def parse_article(self):
        title = self.title()
        author = self.author_name()
        url = self.article_url
        tag_list = self.tags()

        return Article(title, author, url, tag_list)
Example #20
def process_articles():
    pages = 0
    # wiki_categories = obtain_categories(config_db.get("host"), config_db.get("db"), config_db.get("user"),
    #                                     config_db.get("passwd"),
    #                                     min=config_categories.get("articles_min"),
    #                                     max=config_categories.get("articles_max"))

    # file = open("./test_articles.txt", "w")
    with open(dataset, "r") as data:
        for line in data:
            elements = line.split(";")
            article_id = elements[0]
            article_title = elements[1]
            categories_part = elements[3][1:-2]
            categories_part = categories_part.replace("\'", "")
            article_categories = [
                category.strip() for category in categories_part.split(",")
            ]
            # article_categories = [c for c in elements[3] if c in wiki_categories]

            if article_categories:
                article = Article(id=article_id, categories=article_categories)
                process_article(article, article_title)
                # linea = str.format("{} ; {} ; {}\n", article_title, str(clean_text(article_title)), str(article_categories))
                # file.write(linea)
                del article

            pages += 1
            if pages % 10000 == 0:
                print("Processed pages = ", pages)
Example #21
def query_candidates(doc):
    min = MinHash(num_perm=128)
    keyword = doc.keyword.split(",")
    for k in keyword:
        time.sleep(2)
        # print(k)
        trans_text = translate_yandex(str(k), src="vi",
                                      dest="en").encode("utf-8")
        print(trans_text)
        min.update(trans_text)
    # result = forest.query(min, 3)
    result = lsh.query(min)
    result = ",".join(result)
    if not result:
        print(doc.title)
        print("----------------------------------------------")
        print("Not found")
        print("\n")
    else:
        docs = mydb.execute_query(
            "SELECT id, keyword, title FROM english WHERE id IN (" + result +
            ")")
        titles = [
            Article(id=item[0], keyword=item[1], title=item[2])
            for item in docs
        ]
        print(doc.title)
        print("----------------------------------------------")
        for i in titles:
            print(i.title)
        print("\n")
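query_candidates assumes an lsh index that was populated elsewhere. A sketch of how such an index is typically built with datasketch — the library is an inference from MinHash(num_perm=128) and lsh.query, and english_docs is a hypothetical iterable of (id, keywords) pairs:

from datasketch import MinHash, MinHashLSH

lsh = MinHashLSH(threshold=0.5, num_perm=128)   # num_perm must match the query side
for doc_id, keywords in english_docs:           # hypothetical: (id, ["keyword", ...]) pairs
    m = MinHash(num_perm=128)
    for k in keywords:
        m.update(k.encode("utf-8"))
    lsh.insert(str(doc_id), m)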
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode', '-m', help='work mode\n 1 for build mode \n 2 for find mode \n default None', default=None)
    parser.add_argument(
        '--config', help='use local config', dest='config', default=True, action='store_true')
    parser.add_argument('--no-config', help='not use local config',
                        dest='config', action='store_false')
    parser.add_argument('--file','-f', help="not working")
    parser.add_argument('--floder','-F', help='not working')
    args = parser.parse_args()
    CONFIG = safe_IO.load_json('./FAIDK.config')
    FLAG = safe_IO.check_flag(args.mode)
    if FLAG == 'q':
        logger.info('user exit')
        return
    safe_IO.check_output_file(CONFIG['MAIN_PATH'] + CONFIG['NEW_WORDS_PATH'])
    # NEM_WORDS_ALL = set()
    # if args.file is not None:
        # FILE_NAMES = [args.file]
        # logger.info('using args.file: '+','.join(FILE_NAMES))
    # else:
    FILE_NAMES = safe_IO.get_name(
        CONFIG['MAIN_PATH'] + CONFIG['ARTICLES_PATH'])
    safe_IO.try_make_dir(CONFIG['MAIN_PATH'] + CONFIG['OLD_ARTICLES_PATH'])
    for file in FILE_NAMES:
        # article = 
        Article(CONFIG, file, FLAG)
Example #23
def get_articles():
    articles = []

    links = []
    headers = []
    topics = []
    authors = []
    dates = []

    for i in range(0, 4032, 29):
        curr_url = BASE_URL + '/?start=' + str(i)
        response = urllib.request.urlopen(curr_url)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')

        headers, topics = get_headers_and_topics(soup, headers, topics)
        authors += get_authors(soup)
        dates += get_dates(soup)

    # modifier
    headers, links = make_headers_and_links(headers, links)

    for i, link in enumerate(links):
        try:
            articles.append(
                Article(link, headers[i], topics[i], authors[i], dates[i]))
        except:
            print('i:', i, 'link:', link, 'end of the link')
            print(headers[i])
            print(len(headers), len(topics), len(authors), len(dates))
            print(authors[i - 1])
            __import__('sys').stdout.flush()

    return articles
Example #24
    def extract_article(self, block):
        title = block.getText()

        if title:
            return Article(title)

        return None
Example #25
    def scrape_latest_updates(self):
        """
        scrapes all articles from
        the 'Latest Updates' section
        """
        articles = []

        soup = bs(self.driver.page_source, 'html.parser')
        soup = soup.find("h2", {"id": "latest-updates"}).find_parent('div')
        elements = soup.find_all("li", {"class": "lx-stream__post-container"})
        for el_soup in elements:
            url = self.get_url(
                el_soup.find("a", {"class": "qa-story-cta-link"}))
            if url is None or url[0:6] != '/news/':
                continue
            title = self.get_text(
                el_soup.find("h3", {"class": "lx-stream-post__header-title"}))
            text = self.get_text(
                el_soup.find("p",
                             {"class": "lx-stream-related-story--summary"}))
            date = self.get_date(
                el_soup.find("span", {"class": "qa-visually-hidden-meta"}))
            img = self.get_src(
                el_soup.find(
                    "img", {"class": "lx-stream-related-story--index-image"}))

            # FILTER articles to be scraped
            # We only take articles which have a url (so the full content can be scraped)
            # and that are in the news section of the website
            articles.append(Article(title, text, date, url, img))

        return articles
Example #26
def parse_article(article):
    # Get article date
    date_div = article.find('div', class_='views-field-created-1')
    date_txt = date_div.find('div',
                             class_='post-day').text + ' ' + date_div.find(
                                 'div', class_='post-month').text

    # Get title
    title_div = article.find('div', class_='views-field-title')
    title_txt = title_div.find('a').text

    # Get author
    author_div = article.find('div', class_='views-field-name')
    author_txt = author_div.find('span', class_='field-content').text

    # Get body
    body_div = article.find('div', class_='views-field-body')
    body_txt = body_div.find('p').text

    # Get link
    link_div = article.find('div', class_='views-field-view-node')
    link_txt = link_div.find('a').text
    link_href = link_div.find('a')['href']

    # Return Article object
    return Article(date_txt, title_txt, author_txt, body_txt, link_href)
Example #27
def add_articles_to_current_clusters(API_URL,
                                     selected_ungrouped_article_id_list):
    news_groups = requests.get(
        f"{API_URL}/news?should_get_articles_and_id_only=true").json()
    news_groups_with_preprocessed_articles = []

    for ng in news_groups:
        news_group_with_preprocessed_articles = []
        for article_id in ng["articles"]:
            news_group_with_preprocessed_articles.append(
                preprocess_article(
                    Article(
                        requests.get(
                            f"{API_URL}/articles/{article_id}").json())))
        news_groups_with_preprocessed_articles.append(
            news_group_with_preprocessed_articles)

    ungrouped_article_ids = requests.get(
        f"{API_URL}/articles/?is_grouped=false&should_get_features_for_preprocessing=true"
    ).json()
    ungrouped_articles = [
        Article(a) for a in ungrouped_article_ids
        if a['_id'] in selected_ungrouped_article_id_list
    ]
    ungrouped_preprocessed_articles = [
        preprocess_article(a) for a in ungrouped_articles
    ]

    updated_news_group_ids = set()
    for i in range(len(news_groups_with_preprocessed_articles)):
        for x in ungrouped_preprocessed_articles:
            if x.is_grouped or is_news_not_belongs_to_group(
                    x, news_groups_with_preprocessed_articles[i]):
                continue
            print(f'Has same topic, {news_groups[i]["_id"]}, {x.id}')
            updated_news_group_ids.add(news_groups[i]["_id"])
            news_groups[i]["articles"].append(x.id)
            x.is_grouped = True
    print(f'updated_news_group_ids: {list(updated_news_group_ids)}')

    for ng in news_groups:
        ng_id = ng["_id"]
        if ng_id in updated_news_group_ids:
            requests.put(f"{API_URL}/news/{ng_id}",
                         json={"articles": ng["articles"]})

    return list(updated_news_group_ids)
Example #28
 def get_articles(self):
     column_name = self.get_column_name()
     scroll_loader = ScrollLoader(
         "get", "http://zhuanlan.zhihu.com/api/columns/" + column_name +
         "/posts?limit=10", 10)
     from Article import Article
     for response in scroll_loader.run():
         yield Article("http://zhuanlan.zhihu.com" + response)
Example #29
 def __init__(self, url, headers=None, request=None, soup=None):
     self.url = url
     self.headers = headers
     self.request = requests.get(self.url, headers=self.headers)
     self.soup = BeautifulSoup(self.request.content, 'html.parser')
     self.articles = [
         Article(article_html)
         for article_html in self.soup.find_all('article')
     ]
Example #30
def StartFromSeed(seedLink, seedTitle):  
    SeedArticle = Article()
    SeedArticle.link = seedLink
    SeedArticle.title = seedTitle
    SeedArticle.identification = parseIdentificationFromLink(SeedArticle.link)
    
    databasefile = "database.csv"
    graphfile = "graph.csv"
    
    SeedArticle.references = GetReferenceList(SeedArticle, databaseFile = databasefile, graphFile = graphfile)
    mapToInsert = {}
    mapToInsert[SeedArticle.identification] = SeedArticle
    for art in SeedArticle.references:
        mapToInsert[art.identification] = art
        
    AppendDatabaseFromMap(mapToInsert, databasefile, graphfile)

    print 'done'
Example #31
def insertIDF(table_store):
  articles = mydb.execute_query("SELECT id, content, word FROM " + table_store)
  list_articles = [Article(item[0], item[1], item[2]) for item in articles]
  contents = [item.content for item in list_articles]
  tf = TfidfVectorizer(use_idf=True)
  tf.fit_transform(contents)
  idf = tf.idf_
  pdb.set_trace()
  mydb.insert("INSERT INTO idf (" + str(table_store) + ") VALUES (%s)", (idf))
Example #32
def counter(table, word=False):
    articles = mydb.execute_query("SELECT id, content, word, keyword FROM " + table)
    list_articles = [Article(item[0], item[1], item[2], item[3]) for item in articles]
    if word:
        contents = [item.word for item in list_articles]
    else:
        contents = [item.content for item in list_articles]
    cv = CountVectorizer(stop_words=get_stopwords(table))
    return cv, cv.fit_transform(contents)
Example #33
    def test_get_article_without_company_returns_article_obj(self):
        expected_article = Article(
            url=self.article_with_no_company,
            title=self.article_no_co_title
        )

        ret = self.parser.get_article(self.article_no_co_title, self.article_with_no_company)

        self.assertEqual(ret, expected_article)
Example #34
def parseListHtml(page, titleindex):
    next_page = {'page': page, 'title': titleindex}
    common.save_now_page(next_page)
    mysql = Mysql()
    s = ''
    if page > 1:
        s = '_' + repr(page)
    print(url.format(titles[titleindex], s))
    try:
        response = requests.get(url.format(titles[titleindex], s),
                                headers=headers,
                                timeout=10)
        response.encoding = 'gb2312'
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            re_coms = soup.find_all('ul', attrs={'class': 'recom_list'})
            articles = []
            for re_com in re_coms:
                article = Article(re_com.a.string, re_com.find('span', attrs={'class': 'gd1'}).a.attrs['href'])
                article.author = 'OK学车'
                article.contentHead = parseContentHead(re_com.find('li', attrs={'class': 'recom_nr'}).text)
                article.type = types[titles[titleindex]]
                articles.append(article)
            parseArticle(articles)
            # save to the database
            mysql.insert_array(articles)
            mysql.close()
            # common.save_file(titles[titleIndex], '第{0}页'.format(page), repr(common.convert_to_dicts(articles)))
            sleep_time = random.randint(5, 10)
            print('休息', sleep_time, 's后再获取')
            time.sleep(sleep_time)
            parseListHtml(page + 1, titleindex)
        else:
            mysql.close()
            if titleindex + 1 < len(titles):
                parseListHtml(1, titleindex + 1)
    except Exception as e:
        print(traceback.format_exc())
        print('网页获取失败:', e)
        mysql.close()
        sleep_time = random.randint(1, 5)
        print(repr(sleep_time), 's后重新获取')
        time.sleep(sleep_time)
        parseListHtml(page + 1, titleindex)
Example #35
    def get_doc_lines(self, bill_number):
        lines = []
        query_string = """  SELECT `Art_Cod_Logico`, `DFA_Cantidad`, `DFA_PV_SinImp`, `DFA_Descuento`, `DFA_Monto_IV`, `DFA_Porc_IV`, 
        `DFA_Precio_Venta`, `DFA_Porc_Exoneracion` FROM DET_FACTURA WHERE `Fenc_Numero` = ?  """
        self.current_connection = AccessConnection()
        if self.current_connection.status:
            query_output, result = self.current_connection.run_query(
                query_string, (str(bill_number), ))
            if result:
                for counter, row in enumerate(query_output):
                    current_article = Article(row[0])
                    article_dictionary = current_article.get_article_data()
                    lines.append({
                        'numero_linea': counter + 1,
                        'codigo': row[0],
                        'cantidad': row[1],
                        'detalle': article_dictionary.get("description"),
                        'precio': row[2],
                        'descuento': row[3],
                        'impuesto': row[4],
                        'porcentaje_impuesto': row[5],
                        'total': row[6],
                        'porcentaje_exoneracion': row[7],
                        'unidad': article_dictionary.get("unit"),
                        'codigo_impuesto': article_dictionary.get("iva_code"),
                        'tarifa_impuesto': article_dictionary.get("iva_tarif"),
                        'cabys': article_dictionary.get('cabys')
                    })

        return lines
Example #36
def init():
    global driver, soup, page_type, page, website
    driver = webdriver.Chrome()
    soup = ""
    page_type = ""
    page = Page()
    page = Article()
    page = YoutubePage()
    page = YoutubeVideo()
    website = ""
Example #37
 def generateXML(self):
     
     """
     Create the XML File
     """
     document = Document()
     
     home = document.createElement("SmartCruizerData")
     document.appendChild(home)
     
     for element in self.ArticleList.values():
         
         Article = document.createElement("Article")
         home.appendChild(Article)
         
         """
         Heading
         """
         heading = document.createElement("Heading")
         Article.appendChild(heading)
         
         headingValue = document.createTextNode(element.getHeading())
         heading.appendChild(headingValue)
         
         """
         ShortText
         """
         shortText = document.createElement("ShortText")
         Article.appendChild(shortText)
         
         shortTextValue = document.createTextNode(element.getShortText())
         shortText.appendChild(shortTextValue)
         
         """
         Thumbnail
         """
         Thumbnail = document.createElement("Thumbnail")
         Article.appendChild(Thumbnail)
         
         ThumbnailValue = document.createTextNode(element.getThumbnail())
         Thumbnail.appendChild(ThumbnailValue)
         
         """
         Text
         """
         text = document.createElement("Text")
         Article.appendChild(text)
         
         textValue = document.createTextNode(element.getText())
         text.appendChild(textValue)
         
     return document.toprettyxml(indent="", encoding="utf-8")
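Because toprettyxml() is called with encoding="utf-8", generateXML() returns bytes rather than str, so a caller should write the result in binary mode. A usage sketch (the exporter class name is hypothetical; only the generateXML method is shown above):

exporter = SmartCruizerExporter()                  # hypothetical owner of generateXML()
with open("SmartCruizerData.xml", "wb") as fh:     # binary mode because the result is bytes
    fh.write(exporter.generateXML())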
Example #38
    else:
        xapian_name = options.n

    #set PSQL database name
    database = options.d
    #set synonym path
    synonymPath = options.s

    #Synonym file to use
    if not (os.path.isfile(synonymPath)):
        sys.exit( "synonym file not existing - programme terminates" )

    if options.x:
        #import class Article from Article.py and connect to PostgreSQL database
        from Article import Article
        Article.getConnection(database)
        #select all articles in a range of years x >= b_year and x <= e_year
        articles = Article.getArticlesByYear(b_year,e_year)
        Article.closeConnection()
        print "\n-------------"
        print "processing files from year " + str(b_year) + " to " + str(e_year)
        print "-------------"
        print "got articles from PostgreSQL database"
        print "-------------"
    #take the last year to create directory
    indexer  = PubMedXapian(xapian_name, xapianPath = options.xapian_database_path)
    #build full text index with Xapian for all articles selected before
    if options.x:
        print "now indexing articles in Xapian"
        indexer.buildIndexWithArticles(articles)
        print "\n-------------"
Example #39
    check_call(["mkdir", image_folder])

    pages = list()
    page_count = 0
    for pdf_page in PDFPage.create_pages(document):
        interpreter.process_page(pdf_page)
        layout = device.get_result()
        page = Page(layout, page_number=page_count+1, jpg=page_images[page_count])
        page.find_segment_top_neighbors()
        pages.append( page )
        page_count += 1


    fp.close()

    pdfArticle = Article(pages, pdf_name)
    pdfArticle.find_default_fonts()
    pdfArticle.find_content_distances()
    pdfArticle.save_content(style="lines")
    pdfArticle.concatenate_segments()
    pdfArticle.identify_num_columns()
    pdfArticle.identify_sections()
    pdfArticle.save_images(image_folder)

    if xml_file != "":
        if label_mode == "A" or label_mode == "a":
            pdfArticle.assign_labels(xml_file)
            pdfArticle.print_label_accuracy()
        else:
            feature_vecs = XML_Parser.retrieve_tags(xml_file)
            feature_vecs.sort(key=lambda x:x[1])
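This fragment presupposes a pdfminer layout-analysis setup created in code that was cut off above. A sketch of what typically produces the fp, document, interpreter and device names used here, assuming pdfminer.six (pdf_path is a hypothetical variable; page_images, pdf_name and xml_file still come from the omitted code):

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

fp = open(pdf_path, "rb")                          # hypothetical path to the source PDF
document = PDFDocument(PDFParser(fp))
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)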
Example #40
    def convertToArticle(self, article):
        """Takes the parsed Pubmed article and converts it to our lightweight format.
		
        :param article: A parsed Pubmed article
        :rtype: A properly formatted Article object (stored object)
        """
        articleObject = Article()

        # Loop over all the citation data
        medlineCitation = article["MedlineCitation"]
        for attr in medlineCitation:
            if attr == "PMID":
                articleObject.addAttribute("id", "pubmed", str(medlineCitation["PMID"]))

            if attr == "DateCreated":
                day = medlineCitation["DateCreated"]["Day"]
                month = medlineCitation["DateCreated"]["Month"]
                year = medlineCitation["DateCreated"]["Year"]
                dateCreated = self.convertDateToNative(day, month, year)
                articleObject.addAttribute("dateCreated", "pubmed", dateCreated)

            if attr == "DateCompleted":
                day = medlineCitation["DateCompleted"]["Day"]
                month = medlineCitation["DateCompleted"]["Month"]
                year = medlineCitation["DateCompleted"]["Year"]
                dateCompleted = self.convertDateToNative(day, month, year)
                articleObject.addAttribute("dateCompleted", "pubmed", dateCompleted)

            if attr == "Article":
                articleObject.addAttribute("title", "pubmed", medlineCitation["Article"]["ArticleTitle"].encode("utf8"))
                articleObject.addAttribute(
                    "abstract", "pubmed", medlineCitation["Article"]["Abstract"]["AbstractText"][0].encode("utf8")
                )

                authors = []
                for author in medlineCitation["Article"]["AuthorList"]:
                    authors.append("%s %s" % (author["ForeName"], author["LastName"]))
                articleObject.addAttribute("authors", "pubmed", authors)

                articleObject.addAttribute("source", "pubmed", str(medlineCitation["Article"]["Journal"]["Title"]))

        # Loop over all the aspects of the pubmed data
        for attr in article["PubmedData"]:
            if attr == "ArticleIdList":
                for id in article["PubmedData"]["ArticleIdList"]:
                    articleObject.addAttribute("id", id.attributes["IdType"], str(id))

        return articleObject
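A hedged usage sketch for the converter above. The "MedlineCitation"/"PubmedData" keys match what Biopython's Entrez parser returns, so Biopython is assumed here; PubmedConverter is a hypothetical name for the class that defines convertToArticle, and the PMID is a placeholder:

from Bio import Entrez                             # assumption: Biopython supplies the parsed records

Entrez.email = "you@example.com"
handle = Entrez.efetch(db="pubmed", id="123456", retmode="xml")
records = Entrez.read(handle)
handle.close()

converter = PubmedConverter()                      # hypothetical owner of convertToArticle()
articles = [converter.convertToArticle(rec) for rec in records["PubmedArticle"]]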