Example #1
def getArticle(key_word, real_url_set):
    path = "C:/Users/Liuyus/Desktop/大停电爬虫/" + method + key_word
    os.mkdir(path)
    print(real_url_set)
    count = 0
    for real_url in real_url_set:
        try:
            time.sleep(1)
            g = goose3.Goose({
                'browser_user_agent':
                'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
                'stopwords_class': StopWordsChinese
            })
            article = g.extract(url=real_url)

            print(real_url)
            print(article.cleaned_text)
            filename = str(count) + ".txt"
            f = open(path + "/" + filename, "w", encoding="utf-8")  # UTF-8 so Chinese text is saved correctly
            f.write(article.title)
            f.write(real_url)
            f.write(article.cleaned_text)
            f.close()
            count = count + 1
        except Exception:
            # skip URLs that time out or fail to extract
            continue
    return 0
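A minimal setup sketch for Example #1, since the snippet relies on names it does not define: "method" (a folder-name prefix) and the imports. The prefix value and the sample URL below are assumptions, not part of the original.

import os
import time
import goose3
from goose3.text import StopWordsChinese

method = "baidu_"  # hypothetical prefix used to build the output folder name
sample_urls = {"https://example.com/blackout-report"}  # placeholder URL set
getArticle("power outage", sample_urls)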
Example #2
def getArticle(key_word, real_url_set):
    path = "C:/Users/Liuyus/Desktop/大停电爬虫/" + method + key_word
    os.mkdir(path)
    print(real_url_set)
    count = 0
    for real_url in real_url_set:
        try:
            time.sleep(1)
            g = goose3.Goose()
            try:
                driver.get(real_url)
            except:
                driver.refresh()
            time.sleep(2)
            driver.refresh()

            js = "var q=document.documentElement.scrollTop=100000"
            driver.execute_script(js)
            time.sleep(3)

            article = g.extract(raw_html=driver.page_source)
            print(driver.page_source)
            print(real_url)
            print(article.cleaned_text)
            filename = str(count) + ".txt"
            f = open(path + "/" + filename, "w", encoding="utf-8")  # UTF-8 so Chinese text is saved correctly
            f.write(article.title)
            f.write(real_url)
            f.write(article.cleaned_text)
            f.close()
            count = count + 1
        except Exception:
            # skip URLs that time out or fail to extract
            continue
    return 0
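A similar setup sketch for the selenium-based variant in Example #2: "driver" and "method" are module-level globals in the original script. The Chrome driver, prefix, and URL below are illustrative assumptions.

import os
import time
import goose3
from selenium import webdriver

method = "google_"           # hypothetical folder-name prefix
driver = webdriver.Chrome()  # any selenium WebDriver works here

getArticle("blackout", {"https://example.com/news-item"})
driver.quit()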
Example #3
def collect(label, top_headlines, n):
    for obj in top_headlines['articles']:
#        g = goose3.Goose({'enable_image_fetching': False})
        g = goose3.Goose()
        goose_obj = g.extract(url=obj['url'])
        body_text = goose_obj.cleaned_text
        if body_text:
            with open('/Users/siyangyin/Desktop/testSet/'+label+'/'+str(index[n])+'.txt','w') as f:
                f.write(body_text)
                index[n] += 1  # "index" is a module-level list of per-label counters
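A hedged usage sketch for collect(): "top_headlines" looks like the JSON returned by a NewsAPI top-headlines query and "index" like a module-level list of per-label counters; both shapes are assumptions based on how the snippet uses them.

index = [0, 0]
top_headlines = {
    "articles": [
        {"url": "https://example.com/story-1"},
        {"url": "https://example.com/story-2"},
    ]
}
collect("business", top_headlines, 0)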
Example #4
    def __init__(self, article):
        g = goose3.Goose({'enable_image_fetching': True})

        goose_obj = g.extract(raw_html=article.html)
        self.body_text = goose_obj.cleaned_text
        keywords = goose_obj.meta_keywords.split(',')
        self.keywords = [w.strip() for w in keywords]  # meta keywords from the page, not extracted keywords
        self.title = goose_obj.title
        self.authors = goose_obj.authors
        self.top_img = goose_obj.top_image.src
        word_count = len(self.body_text.split())
        self.time = round(word_count / 200.0, 1)  # estimated reading time in minutes at 200 words per minute
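A hedged usage sketch for the wrapper above: it only needs an object exposing a raw .html string. The class name ArticleSummary and the sample HTML are stand-ins, since the original shows only the __init__ body.

class FakeArticle:
    def __init__(self, html):
        self.html = html

html = "<html><head><title>Demo</title></head><body><p>" + "word " * 400 + "</p></body></html>"
summary = ArticleSummary(FakeArticle(html))  # hypothetical class name
print(summary.title, summary.time)           # roughly 2.0 minutes at 200 words per minute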
Example #5
def parse(url, html_raw):
    """Parse raw html and extract metadata.

    :param url: article url
    :type url: str
    :param html_raw: article raw html
    :type html_raw: str
    :return: (meta_goose, meta_paper)
    :rtype: (dict, dict)
    """
    meta_goose = goose3.Goose().extract(raw_html=html_raw).infos
    meta_paper = _get_paper_meta(url, html_raw)

    return (meta_goose, meta_paper)
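A hedged usage sketch for parse(): the caller fetches the raw HTML (requests is an assumption, not shown in the example) and passes it in together with the URL; meta_goose is the dict goose3 exposes as Article.infos.

import requests

url = "https://example.com/some-article"
html_raw = requests.get(url, timeout=10).text
meta_goose, meta_paper = parse(url, html_raw)
print(meta_goose.get("title"))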
Example #6
    def __init__(self, cc=None, seedurls='seedurls', language='es', **kwargs):

        self.g = goose3.Goose()
        self.language = language

        # get ccTLDs from `-a` cmd line arg
        if cc is None:
            self.allowed_domains = ccTLDs
        else:
            self.allowed_domains = [cc]

        # load seed urls
        self.start_urls = []
        for domain in self.allowed_domains:
            with open(seedurls + '/' + domain) as f:
                for line in f:
                    parsed_uri = urlparse(line)
                    url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                    self.start_urls.append(url)

        # initialize the parent spider with any remaining kwargs
        super().__init__(**kwargs)
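A hedged sketch of the inputs this spider expects: "ccTLDs" is a module-level list of country-code domains and seedurls/<domain> is a plain-text file with one URL per line. The list contents and file layout below are assumptions.

ccTLDs = ["es", "mx", "ar"]

# seedurls/es (one URL per line; only scheme and netloc are kept)
# https://elpais.com/some-article
# https://www.abc.es/another-article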
Example #7
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
import goose3
import csv

g = goose3.Goose()

song_prefix = "Lyrics and music to all your favorite Christmas songs and Christmas carols"


class ChristmasSpider(CrawlSpider):
    custom_settings = {
        "RETRY_TIMES": 3,
        "CONCURRENT_REQUESTS": 10,
        "SCHEDULER_DISK_QUEUE": 'scrapy.squeue.PickleFifoDiskQueue',
        "SCHEDULER_MEMORY_QUEUE": 'scrapy.squeue.FifoMemoryQueue',
        "DEPTH_LIMIT": 6,
        "TELNETCONSOLE_PORT": None,
    }

    def __init__(self):
        self.name = "christmascrawler"
        # allowed_domains should list bare domain names (no scheme)
        self.allowed_domains = [
            "www.christmas-songs.org", "christmas-songs.org"
        ]
        self.start_urls = ["http://www.christmas-songs.org"]
        self._rules = [
            Rule(LinkExtractor(allow=(), ),
                 callback=self.parse_item,
Example #8
def data_grammar_analyze(csv_data_file_name):
    original_df = pd.read_csv(
        csv_data_file_name, encoding="ISO-8859-1"
    )  # ISO-8859-1 accepts any byte value, which avoids decode errors on messy input.

    analysis_df = pd.DataFrame()
    analysis_df['URLs'] = 0
    analysis_df['Author'] = []
    analysis_df['Date'] = 0
    analysis_df['HeadLine'] = 0
    analysis_df['Body'] = 0

    df_row_count = 0  # running row index used when filling analysis_df (incremented at the end of each iteration).
    for index, row in original_df.iterrows():  # iterate the df
        gooseErrorFlag = False  # flag stating if the goose article was successfully extracted.
        currURL = original_df.at[index, 'URLs']
        currURL = currURL.replace(
            '\r\n',
            '')  # remove the "\r\n" at the end of URL, it avoids errors.

        # filling newspaper parameters.
        newspaperArticle = newspaper.Article(currURL)
        newspaperArticle.download()
        if newspaperArticle.download_state == 2:  # checks validation of downloading: 2=downloaded, 1=unsuccessful download
            newspaperArticle.parse()
            newspaperAuthor = newspaperArticle.authors
            newspaperDate = newspaperArticle.publish_date
            newspaperHeadLine = newspaperArticle.title
            newspaperBody = newspaperArticle.text
        else:  # if article couldn't download, fill None in all parameters.
            newspaperAuthor = None
            newspaperDate = None
            newspaperHeadLine = None
            newspaperBody = None

        # filling goose parameters.
        goose = goose3.Goose()
        try:
            gooseArticle = goose.extract(
                url=currURL)  # trying to extract the URL
        except Exception:  # enters here only if extraction failed.
            # fill in Nones and raise the flag for any failure (not just 404);
            # otherwise the goose* variables below would be undefined.
            gooseAuthor = None
            gooseDate = None
            gooseHeadLine = None
            gooseBody = None
            gooseErrorFlag = True

        if not gooseErrorFlag:  # if no errors, fill in the parameters.
            gooseAuthor = gooseArticle.authors
            gooseDate = gooseArticle.publish_date
            gooseHeadLine = gooseArticle.title
            gooseBody = gooseArticle.cleaned_text

        # fill analysis_df with one library's output first; newspaper is chosen here arbitrarily.
        analysis_df.loc[df_row_count, 'URLs'] = currURL
        analysis_df.loc[df_row_count, 'Author'] = str(
            newspaperAuthor
        )  # df can't have list as a value, thus convert to string.
        analysis_df.loc[df_row_count, 'Date'] = newspaperDate
        analysis_df.loc[df_row_count, 'HeadLine'] = newspaperHeadLine
        analysis_df.loc[df_row_count, 'Body'] = newspaperBody

        # Next four checks: if a field is empty or missing, fall back to the
        # value extracted by the other library, which may have better content.
        if (pd.isna(analysis_df.loc[df_row_count, 'Author'])
                or (analysis_df.loc[df_row_count, 'Author'] == '[]')):
            analysis_df.loc[df_row_count, 'Author'] = str(gooseAuthor)

        if pd.isna(analysis_df.loc[df_row_count, 'Date']):
            analysis_df.loc[df_row_count, 'Date'] = gooseDate

        if (pd.isna(analysis_df.loc[df_row_count, 'HeadLine'])
                or (analysis_df.loc[df_row_count, 'HeadLine'] == '')):
            analysis_df.loc[df_row_count, 'HeadLine'] = gooseHeadLine

        if (pd.isna(analysis_df.loc[df_row_count, 'Body'])
                or (analysis_df.loc[df_row_count, 'Body'] == '')):
            analysis_df.loc[df_row_count, 'Body'] = gooseBody

        df_row_count = df_row_count + 1

    # ------after reading all articles, start analyzing------

    initDataPerArticle = {}
    for index, row in original_df.iterrows():
        articleAttrs = {}

        # check only articles with a header and a body
        headerFlagError = False
        bodyFlagError = False
        currentURL = original_df.at[index, 'URLs']
        currentLabel = original_df.at[index, 'Label (1=true)']
        if currentLabel != '0' and currentLabel != '1':
            currentLabel = 'missing label'
        currentHeadLineList = original_df.at[index, 'Headline']
        currentBodyList = original_df.at[index, 'Body']
        if (currentHeadLineList
                == []) or (not (isinstance(currentHeadLineList, str))):
            headerFlagError = True
        if (currentBodyList == []) or (not (isinstance(currentBodyList, str))):
            bodyFlagError = True

        # process only articles that have both a header and a body
        if not headerFlagError and not bodyFlagError:
            currentHeadLineStr = re.sub(r"[^\w]", " ",
                                        currentHeadLineList).split()
            currentBodyStr = re.sub(r"[^\w]", " ", currentBodyList).split()

            # number of sentences, mean length, shortest and longest ones
            tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            HeadLineSentences = tokenizer.tokenize(currentHeadLineList)
            BodySentences = tokenizer.tokenize(currentBodyList)

            articleAttrs['numOfHeaderSentences'] = len(HeadLineSentences)
            articleAttrs['numOfBodySentences'] = len(BodySentences)

            meanSentencesCount = 0
            minHeaderLength = 999999999
            maxHeaderLength = 0
            for headerSentence in HeadLineSentences:
                words = re.sub(r"[^\w]", " ", headerSentence).split()
                sumToAdd = len(words)
                meanSentencesCount = meanSentencesCount + sumToAdd

                if len(words) != 0:
                    if len(words) < minHeaderLength:
                        minHeaderLength = len(words)

                    if len(words) > maxHeaderLength:
                        maxHeaderLength = len(words)

            articleAttrs['meanHeaderLen'] = meanSentencesCount / len(
                HeadLineSentences)
            articleAttrs['minHeaderLength'] = minHeaderLength
            articleAttrs['maxHeaderLength'] = maxHeaderLength

            meanSentencesCount = 0
            minBodyLength = 999999999
            maxBodyLength = 0
            for bodySentence in BodySentences:
                words = re.sub(r"[^\w]", " ", bodySentence).split()
                sumToAdd = len(words)
                meanSentencesCount = meanSentencesCount + sumToAdd

                if len(words) != 0:
                    if len(words) < minBodyLength:
                        minBodyLength = len(words)

                    if len(words) > maxBodyLength:
                        maxBodyLength = len(words)

            articleAttrs['meanBodyLen'] = meanSentencesCount / len(
                BodySentences)
            articleAttrs['minBodyLength'] = minBodyLength
            articleAttrs['maxBodyLength'] = maxBodyLength

            # check for misspells
            word_list = brown.words()
            word_set = set(word_list)
            headerMisspellCount = 0
            bodyMisspellCount = 0
            for headerWord in currentHeadLineStr:
                if headerWord not in word_set:
                    headerMisspellCount = headerMisspellCount + 1

            for bodyWord in currentBodyStr:
                if bodyWord not in word_set:
                    bodyMisspellCount = bodyMisspellCount + 1

            articleAttrs['headerMisspellRate'] = headerMisspellCount / len(
                currentHeadLineStr)
            articleAttrs['bodyMisspellRate'] = bodyMisspellCount / len(
                currentBodyStr)

            # check for word grammar category
            headerGrammarDict = {}
            for line in HeadLineSentences:
                tmp = nltk.word_tokenize(line)
                grammarPerLine = nltk.pos_tag(tmp)

                for pair in grammarPerLine:
                    grammarType = pair[1]

                    if grammarType not in headerGrammarDict.keys():
                        headerGrammarDict[grammarType] = 1
                    else:
                        tmpValue = headerGrammarDict[grammarType]
                        headerGrammarDict[grammarType] = tmpValue + 1

            bodyGrammarDict = {}
            for line in BodySentences:
                tmp = nltk.word_tokenize(line)
                grammarPerLine = nltk.pos_tag(tmp)

                for pair in grammarPerLine:
                    grammarType = pair[1]

                    if grammarType not in bodyGrammarDict.keys():
                        bodyGrammarDict[grammarType] = 1
                    else:
                        tmpValue = bodyGrammarDict[grammarType]
                        bodyGrammarDict[grammarType] = tmpValue + 1

            headerGrammarMaxKey = max(headerGrammarDict.keys(),
                                      key=(lambda k: headerGrammarDict[k]))
            headerGrammarMinKey = min(headerGrammarDict.keys(),
                                      key=(lambda k: headerGrammarDict[k]))
            headerGrammarMaxTimes = headerGrammarDict[headerGrammarMaxKey]
            headerGrammarMinTimes = headerGrammarDict[headerGrammarMinKey]
            articleAttrs['headerGrammarMaxKey'] = headerGrammarMaxKey
            articleAttrs['headerGrammarMaxTimes'] = headerGrammarMaxTimes
            articleAttrs['headerGrammarMinKey'] = headerGrammarMinKey
            articleAttrs['headerGrammarMinTimes'] = headerGrammarMinTimes

            bodyGrammarMaxKey = max(bodyGrammarDict.keys(),
                                    key=(lambda k: bodyGrammarDict[k]))
            bodyGrammarMinKey = min(bodyGrammarDict.keys(),
                                    key=(lambda k: bodyGrammarDict[k]))
            bodyGrammarMaxTimes = bodyGrammarDict[bodyGrammarMaxKey]
            bodyGrammarMinTimes = bodyGrammarDict[bodyGrammarMinKey]
            articleAttrs['bodyGrammarMaxKey'] = bodyGrammarMaxKey
            articleAttrs['bodyGrammarMaxTimes'] = bodyGrammarMaxTimes
            articleAttrs['bodyGrammarMinKey'] = bodyGrammarMinKey
            articleAttrs['bodyGrammarMinTimes'] = bodyGrammarMinTimes
            articleAttrs['label'] = currentLabel

        else:
            articleAttrs['numOfHeaderSentences'] = 0
            articleAttrs['numOfBodySentences'] = 0
            articleAttrs['meanHeaderLen'] = 0
            articleAttrs['minHeaderLength'] = 0
            articleAttrs['maxHeaderLength'] = 0
            articleAttrs['meanBodyLen'] = 0
            articleAttrs['minBodyLength'] = 0
            articleAttrs['maxBodyLength'] = 0
            articleAttrs['headerMisspellRate'] = 0
            articleAttrs['bodyMisspellRate'] = 0
            articleAttrs['headerGrammarMaxKey'] = 0
            articleAttrs['headerGrammarMaxTimes'] = 0
            articleAttrs['headerGrammarMinKey'] = 0
            articleAttrs['headerGrammarMinTimes'] = 0
            articleAttrs['bodyGrammarMaxKey'] = 0
            articleAttrs['bodyGrammarMaxTimes'] = 0
            articleAttrs['bodyGrammarMinKey'] = 0
            articleAttrs['bodyGrammarMinTimes'] = 0
            articleAttrs['label'] = 0

        initDataPerArticle[currentURL] = articleAttrs

    result_df = deepcopy(original_df)
    result_df['numOfHeaderSentences'] = 0
    result_df['numOfBodySentences'] = 0
    result_df['meanHeaderLen'] = 0
    result_df['minHeaderLength'] = 0
    result_df['maxHeaderLength'] = 0
    result_df['meanBodyLen'] = 0
    result_df['minBodyLength'] = 0
    result_df['maxBodyLength'] = 0
    result_df['headerMisspellRate'] = 0
    result_df['bodyMisspellRate'] = 0
    result_df['headerGrammarMaxKey'] = 0
    result_df['headerGrammarMaxTimes'] = 0
    result_df['headerGrammarMinKey'] = 0
    result_df['headerGrammarMinTimes'] = 0
    result_df['bodyGrammarMaxKey'] = 0
    result_df['bodyGrammarMaxTimes'] = 0
    result_df['bodyGrammarMinKey'] = 0
    result_df['bodyGrammarMinTimes'] = 0

    result_df_row_count = 0
    for url, attrs in initDataPerArticle.items():
        result_df.loc[result_df_row_count,
                      'numOfHeaderSentences'] = attrs['numOfHeaderSentences']
        result_df.loc[result_df_row_count,
                      'numOfBodySentences'] = attrs['numOfBodySentences']
        result_df.loc[result_df_row_count,
                      'meanHeaderLen'] = attrs['meanHeaderLen']
        result_df.loc[result_df_row_count,
                      'minHeaderLength'] = attrs['minHeaderLength']
        result_df.loc[result_df_row_count,
                      'maxHeaderLength'] = attrs['maxHeaderLength']
        result_df.loc[result_df_row_count,
                      'meanBodyLen'] = attrs['meanBodyLen']
        result_df.loc[result_df_row_count,
                      'minBodyLength'] = attrs['minBodyLength']
        result_df.loc[result_df_row_count,
                      'maxBodyLength'] = attrs['maxBodyLength']
        result_df.loc[result_df_row_count,
                      'headerMisspellRate'] = attrs['headerMisspellRate']
        result_df.loc[result_df_row_count,
                      'bodyMisspellRate'] = attrs['bodyMisspellRate']
        result_df.loc[result_df_row_count,
                      'headerGrammarMaxKey'] = attrs['headerGrammarMaxKey']
        result_df.loc[result_df_row_count,
                      'headerGrammarMaxTimes'] = attrs['headerGrammarMaxTimes']
        result_df.loc[result_df_row_count,
                      'headerGrammarMinKey'] = attrs['headerGrammarMinKey']
        result_df.loc[result_df_row_count,
                      'headerGrammarMinTimes'] = attrs['headerGrammarMinTimes']
        result_df.loc[result_df_row_count,
                      'bodyGrammarMaxKey'] = attrs['bodyGrammarMaxKey']
        result_df.loc[result_df_row_count,
                      'bodyGrammarMaxTimes'] = attrs['bodyGrammarMaxTimes']
        result_df.loc[result_df_row_count,
                      'bodyGrammarMinKey'] = attrs['bodyGrammarMinKey']
        result_df.loc[result_df_row_count,
                      'bodyGrammarMinTimes'] = attrs['bodyGrammarMinTimes']

        result_df_row_count = result_df_row_count + 1

    result_df = result_df.drop(columns=['Unnamed: 4', 'Unnamed: 5'])
    result_df.to_csv('data_grammar_analysis.csv',
                     encoding='utf-8',
                     index=False)
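A hedged usage sketch: data_grammar_analyze() expects a CSV with at least the columns it reads above ('URLs', 'Headline', 'Body', 'Label (1=true)') and writes its output to data_grammar_analysis.csv. The input file name below is a placeholder.

data_grammar_analyze("news_dataset.csv")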
Example #9
# import signal
# class TimeoutException(Exception):   # Custom exception class
#     pass

# def timeout_handler(signum, frame):   # Custom signal handler
#     raise TimeoutException

# signal.signal(signal.SIGALRM, timeout_handler)


# from interruptingcow import timeout

import stopit

g = goose.Goose({'enable_image_fetching': False})  # assumes goose3 has been imported as "goose" elsewhere

@stopit.threading_timeoutable(default='timeout')
def download_doc(url):
    global g
    # with timeout(5, exception=RuntimeError):
    headers = {"User-Agent": USER_AGENT}
    print("getting")
    resp = requests.get(url, headers=headers, timeout=0.5)
    print('got')
    # g = goose.Goose({'enable_image_fetching': False})
    print('parsing')
    doc = g.extract(raw_html=resp.text)
    # import time
    # time.sleep(20)
    print('parsed')
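A hedged usage sketch for download_doc(): stopit's threading_timeoutable decorator adds a "timeout" keyword argument and returns the declared default ('timeout') when the call runs too long. USER_AGENT is a global the snippet assumes; the value and URL below are placeholders, and the snippet itself appears truncated before any return statement.

USER_AGENT = "Mozilla/5.0 (example)"

result = download_doc("https://example.com/article", timeout=5)
if result == "timeout":
    print("extraction took too long")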