Exemplo n.º 1
0
    def __init__(self, *a, **arguments):
        """Initialise the spider, its exit hook and its DB session factory.

        All positional and keyword arguments are forwarded unchanged to the
        parent initialiser; the keyword arguments are additionally kept on
        ``self.arguments``.
        """
        super(CommonSpider, self).__init__(*a, **arguments)
        self.arguments = arguments

        # NOTE: the single-instance guard (is_other_spider_running() followed
        # by sys.exit('There is another running spider')) and the
        # write_pid_file() call are currently disabled; only the
        # remove_pid_file hook below is still registered to run at exit.
        atexit.register(self.remove_pid_file)

        # One engine per spider instance; the session factory is bound to it.
        db_engine = db_connect()
        self.engine = db_engine
        self.Session = sessionmaker(bind=db_engine)
Exemplo n.º 2
0
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from nltk.tokenize import word_tokenize

from filters import FootballCompoundNounFilter, FootballNounFilter, FootballCompoundNounFilter, FootballAliasFilter
from neo4j_util import get_word_aliases

import pickle
import argparse

# Module-level database session, created from a factory bound to the engine
# returned by db_connect().
session_maker = sessionmaker(bind=db_connect())
session = session_maker()

# Command-line interface.
# ``usage`` is intentionally not passed: an explicit empty string overrides
# argparse's auto-generated usage line and leaves a blank "usage:" in both
# --help output and argument-error messages.
parser = argparse.ArgumentParser(
    prog="extract_content",
    description="extract content from article",
    epilog="",
    add_help=True,
)

# -n / --renew: boolean flag, False unless given on the command line.
parser.add_argument("-n", "--renew", action="store_true")
args = parser.parse_args()
Exemplo n.º 3
0
# Command-line interface.
# ``usage`` is intentionally not passed: an explicit empty string overrides
# argparse's auto-generated usage line and leaves a blank "usage:" in both
# --help output and argument-error messages.
# NOTE(review): prog is "extract_content" although this script works on
# similar_articles -- possibly copied from another script; confirm.
parser = argparse.ArgumentParser(
    prog="extract_content",
    description="extract content from article",
    epilog="",
    add_help=True,
)

# Both options are boolean flags that default to False.
parser.add_argument("-n", "--renew", action="store_true")
parser.add_argument("-d", "--dryrun", action="store_true")
args = parser.parse_args()

# Doc2Vec model loaded from a path built on top of ``script_dir``.
m = Doc2Vec.load(script_dir + '/../../var/models/doc2vec.model')

engine = db_connect()

if args.renew:
    # Reset previous results: empty similar_articles and clear the
    # similar_article_calculated flag on every article_contents row.
    # The context manager releases the connection even if a statement raises
    # (the original only closed it on the success path).
    # NOTE(review): this reset is not guarded by --dryrun -- confirm intended.
    with engine.connect() as connection:
        connection.execute("TRUNCATE similar_articles")
        connection.execute(
            "UPDATE article_contents SET similar_article_calculated = 0")

session_maker = sessionmaker(bind=engine)
session = session_maker()
# Set on the session instance after creation, as in the original, so the
# shared session_maker configuration is left untouched.
session.expire_on_commit = False

results = session.query(Articles.hash).filter(
    Articles.hash == ArticleContents.article_hash,
    Articles.feed_id == Feeds.id,
Exemplo n.º 4
0
 def __init__(self):
     """Create a session factory bound to the engine from db_connect()."""
     # Debugging aids left disabled from the original: enabling ``echo`` on
     # the engine, and calling create_items_table() with the engine.
     self.Session = sessionmaker(bind=db_connect())