def __init__(self, *a, **arguments):
    super(CommonSpider, self).__init__(*a, **arguments)
    self.arguments = arguments
    # if self.is_other_spider_running():
    #     sys.exit('There is another running spider')
    #     return
    # self.write_pid_file()
    atexit.register(self.remove_pid_file)
    self.engine = db_connect()
    self.Session = sessionmaker(bind=self.engine)
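# Sketch (assumption, not from the original source): one plausible shape for the
# PID-file helpers referenced above. `self.pid_file_path` and the use of `os` are
# hypothetical; only the method names come from the code above.
def is_other_spider_running(self):
    # If a PID file is present, another spider instance is assumed to be active.
    return os.path.exists(self.pid_file_path)

def write_pid_file(self):
    with open(self.pid_file_path, 'w') as f:
        f.write(str(os.getpid()))

def remove_pid_file(self):
    # Registered with atexit in __init__ so the marker is removed on shutdown.
    if os.path.exists(self.pid_file_path):
        os.remove(self.pid_file_path)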
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from filters import FootballCompoundNounFilter, FootballNounFilter, FootballAliasFilter
from neo4j_util import get_word_aliases
import pickle
import argparse

session_maker = sessionmaker(bind=db_connect())
session = session_maker()

parser = argparse.ArgumentParser(
    prog="extract_content",
    usage="",
    description="extract content from article",
    epilog="",
    add_help=True,
)
parser.add_argument("-n", "--renew", action="store_true")
args = parser.parse_args()
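# Sketch (assumption, not in the original snippet): the imports above are the building
# blocks of a Janome Analyzer pipeline; one plausible assembly is shown below. The
# constructor arguments of the project-specific Football* filters and the `text`
# placeholder are illustrative guesses.
from janome.tokenizer import Tokenizer

analyzer = Analyzer(
    char_filters=[UnicodeNormalizeCharFilter()],
    tokenizer=Tokenizer(),
    token_filters=[
        FootballCompoundNounFilter(),  # project-specific filters imported above;
        FootballNounFilter(),          # their real constructor arguments may differ
        FootballAliasFilter(),
    ],
)

text = "記事本文のプレースホルダー"  # placeholder article body
words = [token.surface for token in analyzer.analyze(text)]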
parser = argparse.ArgumentParser(
    prog="extract_content",
    usage="",
    description="extract content from article",
    epilog="",
    add_help=True,
)
parser.add_argument("-n", "--renew", action="store_true")
parser.add_argument("-d", "--dryrun", action="store_true")
args = parser.parse_args()

m = Doc2Vec.load(script_dir + '/../../var/models/doc2vec.model')

engine = db_connect()
if args.renew:
    # --renew discards previous results and flags every article for recalculation.
    connection = engine.connect()
    result = connection.execute("TRUNCATE similar_articles")
    result = connection.execute(
        "UPDATE article_contents SET similar_article_calculated = 0")
    connection.close()

session_maker = sessionmaker(bind=engine)
session = session_maker()
session.expire_on_commit = False

results = session.query(Articles.hash).filter(
    Articles.hash == ArticleContents.article_hash,
    Articles.feed_id == Feeds.id,
def __init__(self):
    engine = db_connect()
    # engine.echo = True
    # create_items_table(engine)
    self.Session = sessionmaker(bind=engine)
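# Sketch (assumption, not in the original snippet): an __init__ like the one above is
# typically paired with a Scrapy-style process_item that opens a session per item.
# The mapping of `item` onto the Articles model is a hypothetical example.
def process_item(self, item, spider):
    session = self.Session()
    try:
        session.add(Articles(**item))  # hypothetical: item fields assumed to match the model
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
    return item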